From 62bd644971191ec95aecbbfe289064b2827f4d0d Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" <210906412+assistant-librarian[bot]@users.noreply.github.com> Date: Thu, 3 Jul 2025 13:02:14 -0600 Subject: [PATCH] Changing sgpr limits (#304) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In regards to LWPTENSILE-1696 This includes 2 changes: - Unrestricted the temp sgprs needed for gsu from being contiguous, avoiding overflow for certain kernels - Account for additional temp sgprs that will be required for code gen, up to physical limits --- 🔁 Imported from [ROCm/hipBLASLt#2184](https://github.com/ROCm/hipBLASLt/pull/2184) 🧑‍💻 Originally authored by @mahmoodw --------- Co-authored-by: mahmoodw Co-authored-by: mahmoodw <44450175+mahmoodw@users.noreply.github.com> --- .../tensilelite/Tensile/Components/GSU.py | 33 ++++++++++++------- .../Tensile/KernelWriterAssembly.py | 4 ++- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/GSU.py b/projects/hipblaslt/tensilelite/Tensile/Components/GSU.py index d6cc3d6f8989..9f05fd805a13 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/GSU.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/GSU.py @@ -705,18 +705,27 @@ def GSUSynccodegen(self, writer, kernel, tmpVgpr, tmpVgprSize, tmpVgprDynamic, b indices = list(range(0, kernel["ProblemType"]["NumIndicesC"])) numDim = len(indices) - with writer.allocTmpSgpr(5) as tmpSgprInfo: - tmpSgpr = tmpSgprInfo.idx - module.addModuleAsFlatItems(writer.s_mul_u64_u32(sgpr(tmpSgpr+0), sgpr(tmpSgpr+1), sgpr("SizesFree+0"), 1, tmpVgpr, "Free0")) - for i in range(1, numDim): - module.add(SSubU32(dst=sgpr(tmpSgpr+4), src0=sgpr("SizesFree+%u"%i), src1=1, comment="Free%u" % i)) - module.add(SMulI32(dst=sgpr(tmpSgpr+4), src0=sgpr(tmpSgpr+4), src1=1, comment="Free%u" % i)) - module.addModuleAsFlatItems(writer.s_mul_u64_u32(sgpr(tmpSgpr+2), sgpr(tmpSgpr+3), sgpr(tmpSgpr+4), sgpr("StrideC%s"%writer.states.indexChars[i]), tmpVgpr, "Free%u" % i)) - module.add(SAddU32(dst=sgpr(tmpSgpr+0), src0=sgpr(tmpSgpr+0), src1=sgpr(tmpSgpr+2), comment="Free%u" % i)) - module.add(SAddCU32(dst=sgpr(tmpSgpr+1), src0=sgpr(tmpSgpr+1), src1=sgpr(tmpSgpr+3), comment="Free%u" % i)) - - bpetmp = int(writer.states.bpr * kernel["ProblemType"]["DestDataType"].numRegisters()) # self.states.bpeCinternal - module.add(SLShiftLeftB64(dst=sgpr(tmpS04,2), src=sgpr(tmpSgpr+0,2), shiftHex=log2(writer.states.bpeCexternal), comment="scale by bpe")) + tmpSgpr1 = writer.sgprPool.checkOut(2, preventOverflow=False) + tmpSgpr2 = writer.sgprPool.checkOut(1, preventOverflow=False) + tmpSgpr3 = writer.sgprPool.checkOut(1, preventOverflow=False) + tmpSgpr4 = writer.sgprPool.checkOut(1, preventOverflow=False) + + module.addModuleAsFlatItems(writer.s_mul_u64_u32(sgpr(tmpSgpr1+0), sgpr(tmpSgpr1+1), sgpr("SizesFree+0"), 1, tmpVgpr, "Free0")) + + for i in range(1, numDim): + module.add(SSubU32(dst=sgpr(tmpSgpr4), src0=sgpr("SizesFree+%u" % i), src1=1, comment="Free%u" % i)) + module.add(SMulI32(dst=sgpr(tmpSgpr4), src0=sgpr(tmpSgpr4), src1=1, comment="Free%u" % i)) + module.addModuleAsFlatItems(writer.s_mul_u64_u32(sgpr(tmpSgpr2), sgpr(tmpSgpr3), sgpr(tmpSgpr4), sgpr("StrideC%s" % writer.states.indexChars[i]), tmpVgpr, "Free%u" % i)) + module.add(SAddU32(dst=sgpr(tmpSgpr1+0), src0=sgpr(tmpSgpr1+0), src1=sgpr(tmpSgpr2), comment="Free%u" % i)) + module.add(SAddCU32(dst=sgpr(tmpSgpr1+1), src0=sgpr(tmpSgpr1+1), src1=sgpr(tmpSgpr3), comment="Free%u" % i)) + + bpetmp = int(writer.states.bpr * kernel["ProblemType"]["DestDataType"].numRegisters()) # self.states.bpeCinternal + module.add(SLShiftLeftB64(dst=sgpr(tmpS04, 2), src=sgpr(tmpSgpr1+0, 2), shiftHex=log2(writer.states.bpeCexternal), comment="scale by bpe")) + + writer.sgprPool.checkIn(tmpSgpr1) + writer.sgprPool.checkIn(tmpSgpr2) + writer.sgprPool.checkIn(tmpSgpr3) + writer.sgprPool.checkIn(tmpSgpr4) module.addSpaceLine() #####################################cal synchronizer sum start##################################### diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index b00241442f4d..f49d711e1230 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -11614,7 +11614,9 @@ def getMBSKGSUTotal(self, kernel): def setOccupancy(self, kernel): # Use VGPR up to next occupancy threshold: - maxVgprs, occupancy = self.getMaxRegsForOccupancy(kernel["NumThreads"], self.vgprPool.size(), self.sgprPool.size(), \ + # Account for additional temp sgprs that will be required for code gen, up to physical limits. +5 approximates upper end of required temp space for GSUSynccodegenOpt + requiredSgprs = min(self.sgprPool.size() + 5, self.states.regCaps["MaxSgpr"]) + maxVgprs, occupancy = self.getMaxRegsForOccupancy(kernel["NumThreads"], self.vgprPool.size(), requiredSgprs, \ self.getLdsSize(kernel), self.agprPool.size(), self.states.doubleVgpr) # Set occupancy limit for register pools # TODO: Support gfx12