diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index d2bd35e8c67..a776016fad7 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -4257,7 +4257,7 @@ def readWriteVectors(mat, vw, kernel): self.states.startVgpr = vgprIdx vgprIdx += self.states.a.numVgprValu numVgprValuPackA = 0 - if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]: + if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: self.states.a.startVgprValuPack = vgprIdx if self.states.lrvwTileA > 1: numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"] @@ -4292,7 +4292,7 @@ def readWriteVectors(mat, vw, kernel): self.states.b.startVgprValu = vgprIdx vgprIdx += self.states.b.numVgprValu numVgprValuPackB = 0 - if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]: + if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: self.states.b.startVgprValuPack = vgprIdx if self.states.lrvwTileB > 1: numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"] diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index 90e309efbaa..92b8772bfc0 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -4554,7 +4554,7 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackA = 0 if self.states.a.numVgprValu > 0 and not kernel["DirectToVgprA"]: numValuA = self.states.a.numVgprValu - if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]: + if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]: if self.states.lrvwTileA > 1: numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"] if self.states.packDTVA: @@ -4575,7 +4575,7 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te numVgprValuPackB = 0 if self.states.b.numVgprValu > 0 and not kernel["DirectToVgprB"]: numValuB = self.states.b.numVgprValu - if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]: + if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]: if self.states.lrvwTileB > 1: numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"] if self.states.packDTVB: