Skip to content
4 changes: 2 additions & 2 deletions projects/hipblaslt/tensilelite/Tensile/KernelWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4257,7 +4257,7 @@ def readWriteVectors(mat, vw, kernel):
self.states.startVgpr = vgprIdx
vgprIdx += self.states.a.numVgprValu
numVgprValuPackA = 0
if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]:
if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]:
self.states.a.startVgprValuPack = vgprIdx
if self.states.lrvwTileA > 1:
numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"]
Expand Down Expand Up @@ -4292,7 +4292,7 @@ def readWriteVectors(mat, vw, kernel):
self.states.b.startVgprValu = vgprIdx
vgprIdx += self.states.b.numVgprValu
numVgprValuPackB = 0
if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]:
if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]:
self.states.b.startVgprValuPack = vgprIdx
if self.states.lrvwTileB > 1:
numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4554,7 +4554,7 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te
numVgprValuPackA = 0
if self.states.a.numVgprValu > 0 and not kernel["DirectToVgprA"]:
numValuA = self.states.a.numVgprValu
if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"]:
if tensorParametersA["bpe"] < 4 and not kernel["UnrollMajorLDSA"] and not kernel["enableLDSTrA"]:
if self.states.lrvwTileA > 1:
numVgprValuPackA = ceil(kernel["VectorWidthA"] * tensorParametersA["bpe"] / self.states.bpr) * kernel["MIWaveTileA"] // kernel["VectorWidthA"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadA"]
if self.states.packDTVA:
Expand All @@ -4575,7 +4575,7 @@ def tailLoopAllocValuVgpr(self, kernel, tensorParametersA, tensorParametersB, te
numVgprValuPackB = 0
if self.states.b.numVgprValu > 0 and not kernel["DirectToVgprB"]:
numValuB = self.states.b.numVgprValu
if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"]:
if tensorParametersB["bpe"] < 4 and not kernel["UnrollMajorLDSB"] and not kernel["enableLDSTrB"]:
if self.states.lrvwTileB > 1:
numVgprValuPackB = ceil(kernel["VectorWidthB"] * tensorParametersB["bpe"] / self.states.bpr) * kernel["MIWaveTileB"] // kernel["VectorWidthB"] * kernel["InnerUnroll"] * self.states.numVgprBuffer * kernel["MIInputPerThreadB"]
if self.states.packDTVB:
Expand Down
Loading