From 05d1c8ca2dc6e4aa89d0f78b4eccd73e49869e8a Mon Sep 17 00:00:00 2001 From: Zhongze Li Date: Thu, 10 Jul 2025 22:30:17 -0500 Subject: [PATCH 1/2] Fix numVgprs issue when LDSTrInst is enabled. Add an assertation to avoid using LDSTr and multiple LDS buffer for HSS or BSS --- projects/hipblaslt/tensilelite/Tensile/KernelWriter.py | 4 ++++ .../hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py | 4 ++++ .../hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index 657324b03d9..2a173cac447 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -4265,6 +4265,8 @@ def readWriteVectors(mat, vw, kernel): numVgprValuPackA *= 2 else: numVgprValuPackA = self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * (int(4/tensorParametersA["bpeDS"]) - 1) + if kernel["enableLDSTrA"]: + numVgprValuPackA = 2*kernel["InnerUnroll"]*kernel["MIWaveTileA"] vgprIdx += numVgprValuPackA self.states.a.startVgprG2L = None if not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]: @@ -4300,6 +4302,8 @@ def readWriteVectors(mat, vw, kernel): numVgprValuPackB *= 2 else: numVgprValuPackB = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * (int(4/tensorParametersB["bpeDS"]) - 1) + if kernel["enableLDSTrB"]: + numVgprValuPackB = 2*kernel["InnerUnroll"]*kernel["MIWaveTileB"] vgprIdx += numVgprValuPackB self.states.b.startVgprG2L = None if not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index e5547926b4c..7e7cde89ab8 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -4558,6 +4558,8 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackA *= 2 else: numVgprValuPackA = self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * (int(4/tensorParametersA["bpeDS"]) - 1) + if kernel["enableLDSTrA"]: + numVgprValuPackA = 2*kernel["InnerUnroll"]*kernel["MIWaveTileA"] vgprBaseA = self.vgprPool.checkOutAligned(numValuA + numVgprValuPackA, 2) imodA.add(RegSet("v", "vgprValuA_X0_I0_BASE", vgprBaseA)) imodA.add(self.moduleVgprMacroValuA) @@ -4579,6 +4581,8 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackB *= 2 else: numVgprValuPackB = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * (int(4/tensorParametersB["bpeDS"]) - 1) + if kernel["enableLDSTrB"]: + numVgprValuPackB = 2*kernel["InnerUnroll"]*kernel["MIWaveTileB"] vgprBaseB = self.vgprPool.checkOutAligned(numValuB + numVgprValuPackB, 2) imodB.add(RegSet("v", "vgprValuB_X0_I0_BASE", vgprBaseB)) imodB.add(self.moduleVgprMacroValuB) diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index 1f381fca536..e449847b64f 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -1298,6 +1298,10 @@ def assignDerivedParameters( if state["enableLDSTrB"]: state["VectorWidthB"] = 1 + if (state["enableLDSTrA"] or state["enableLDSTrB"]) and (state["1LDSBuffer"] == 0) and (state["ProblemType"]["DataType"] != state["ProblemType"]["DestDataType"]): + reject(state, printRejectionReason, "LDSTrInst cannot support multiple LDS buffer when input datatype is not the same as destination datatype") + return + # if state["EnableMatrixInstruction"] and not state["SourceSwap"] and (state["VectorWidthA"] > 1 or state["VectorWidthB"] > 1): # reject(state, printRejectionReason, "not implement VectorWidth without SourceSwap") From 968ddc657c509aa3c1b0bd52aae9bc9382ee0a60 Mon Sep 17 00:00:00 2001 From: Zhongze Li Date: Tue, 15 Jul 2025 23:49:16 -0500 Subject: [PATCH 2/2] Fix numVgprs issue when LDSTrInst is enabled --- projects/hipblaslt/tensilelite/Tensile/KernelWriter.py | 8 ++------ .../hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py | 8 ++------ .../tensilelite/Tensile/SolutionStructs/Solution.py | 4 ---- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index 2a173cac447..382b915054e 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -4256,7 +4256,7 @@ def readWriteVectors(mat, vw, kernel): self.states.startVgpr = vgprIdx vgprIdx += self.states.a.numVgprValu numVgprValuPackA = 0 - if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]: + if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: self.states.a.startVgprValuPack = vgprIdx if self.states.lrvwTileA > 1: numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"] @@ -4265,8 +4265,6 @@ def readWriteVectors(mat, vw, kernel): numVgprValuPackA *= 2 else: numVgprValuPackA = self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * (int(4/tensorParametersA["bpeDS"]) - 1) - if kernel["enableLDSTrA"]: - numVgprValuPackA = 2*kernel["InnerUnroll"]*kernel["MIWaveTileA"] vgprIdx += numVgprValuPackA self.states.a.startVgprG2L = None if not kernel["DirectToLdsA"] or self.do["KeepDirectToLdsAlloc"]: @@ -4293,7 +4291,7 @@ def readWriteVectors(mat, vw, kernel): self.states.b.startVgprValu = vgprIdx vgprIdx += self.states.b.numVgprValu numVgprValuPackB = 0 - if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]: + if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: self.states.b.startVgprValuPack = vgprIdx if self.states.lrvwTileB > 1: numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"] @@ -4302,8 +4300,6 @@ def readWriteVectors(mat, vw, kernel): numVgprValuPackB *= 2 else: numVgprValuPackB = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * (int(4/tensorParametersB["bpeDS"]) - 1) - if kernel["enableLDSTrB"]: - numVgprValuPackB = 2*kernel["InnerUnroll"]*kernel["MIWaveTileB"] vgprIdx += numVgprValuPackB self.states.b.startVgprG2L = None if not kernel["DirectToLdsB"] or self.do["KeepDirectToLdsAlloc"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index 7e7cde89ab8..5b67c373999 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -4550,7 +4550,7 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackA = 0 if self.states.a.numVgprValu > 0 and not kernel["DirectToVgprA"]: numValuA = self.states.a.numVgprValu - if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]: + if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: if self.states.lrvwTileA > 1: numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"] if self.states.packDTVA: @@ -4558,8 +4558,6 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackA *= 2 else: numVgprValuPackA = self.states.a.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackA * (int(4/tensorParametersA["bpeDS"]) - 1) - if kernel["enableLDSTrA"]: - numVgprValuPackA = 2*kernel["InnerUnroll"]*kernel["MIWaveTileA"] vgprBaseA = self.vgprPool.checkOutAligned(numValuA + numVgprValuPackA, 2) imodA.add(RegSet("v", "vgprValuA_X0_I0_BASE", vgprBaseA)) imodA.add(self.moduleVgprMacroValuA) @@ -4573,7 +4571,7 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackB = 0 if self.states.b.numVgprValu > 0 and not kernel["DirectToVgprB"]: numValuB = self.states.b.numVgprValu - if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]: + if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: if self.states.lrvwTileB > 1: numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"] if self.states.packDTVB: @@ -4581,8 +4579,6 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackB *= 2 else: numVgprValuPackB = self.states.b.numVgprValuPerBlock * kernel["InnerUnroll"] * self.states.numVgprBufferPackB * (int(4/tensorParametersB["bpeDS"]) - 1) - if kernel["enableLDSTrB"]: - numVgprValuPackB = 2*kernel["InnerUnroll"]*kernel["MIWaveTileB"] vgprBaseB = self.vgprPool.checkOutAligned(numValuB + numVgprValuPackB, 2) imodB.add(RegSet("v", "vgprValuB_X0_I0_BASE", vgprBaseB)) imodB.add(self.moduleVgprMacroValuB) diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index e449847b64f..1f381fca536 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -1298,10 +1298,6 @@ def assignDerivedParameters( if state["enableLDSTrB"]: state["VectorWidthB"] = 1 - if (state["enableLDSTrA"] or state["enableLDSTrB"]) and (state["1LDSBuffer"] == 0) and (state["ProblemType"]["DataType"] != state["ProblemType"]["DestDataType"]): - reject(state, printRejectionReason, "LDSTrInst cannot support multiple LDS buffer when input datatype is not the same as destination datatype") - return - # if state["EnableMatrixInstruction"] and not state["SourceSwap"] and (state["VectorWidthA"] > 1 or state["VectorWidthB"] > 1): # reject(state, printRejectionReason, "not implement VectorWidth without SourceSwap")