From d258c35769df51d7524e0abc5f02cc8572a929a0 Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Tue, 17 Feb 2026 12:59:04 -0600 Subject: [PATCH 1/3] Fix some errors breaking non-mx tests on mx branch --- .../tensilelite/Tensile/KernelWriterAssembly.py | 6 +++--- .../tensilelite/Tensile/SolutionStructs/Solution.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index c4de12d1900..a4aea132c09 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -5518,8 +5518,8 @@ def generateReLoadLoop(tc): jumpLabel(tP, sLoadTileIdx, checkAddrLabel) imod.add(checkAddrLabel) imod.add(VSubU32(dst=vgpr(tmpVgpr), src0=vgpr(tmpVgpr), - src1=self.states.srdShiftLeft[tc] * tP["bpeGR"], comment="sub prepad")) - loadRangePerThread = tP["glvw"] * tP["bpeGR"] - 1 + src1=int(self.states.srdShiftLeft[tc] * tP["bpeGR"]), comment="sub prepad")) + loadRangePerThread = int(tP["glvw"] * tP["bpeGR"] - 1) imod.add(VAddU32(dst=vgpr(tmpVgpr+1), src0=vgpr(tmpVgpr), src1=loadRangePerThread, \ comment="Calculate load range per thread")) imod.add(VCmpLtI32(dst=sgpr(sCmpLoadStartAddrStatusx2, 2), src0=vgpr(tmpVgpr), \ @@ -8439,7 +8439,7 @@ def globalReadGuardKBody(tP, optParams = None): destVgpr="G2L%s+%u+%u"%(tc, g2lIdx + tP["shiftGR"] if not tP["isM"] else g2lIdxM, regIdx+eccOffset) self.vgprs.globalReadRegisters[tc].append( (g2lIdx + tP["shiftGR"] if not tP["isM"] else g2lIdxM) + regIdx+eccOffset) - offset = r * tP["bpeGR"] + instOffset + offset = int(r * tP["bpeGR"] + instOffset) comment = "load one buffer value" if (dataType.isHalf() or dataType.isBFloat16()) and not tP["isM"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index ae964d8cadf..ccc39e0eaf8 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -38,7 +38,8 @@ roundUpToNearestMultiple from Tensile.Common.DataType import DataType from Tensile.Common.GlobalParameters import defaultSolution, \ - defaultInternalSupportParams + defaultInternalSupportParams, \ + globalParameters from Tensile.SolutionStructs.Naming import getSolutionNameFull from Tensile.SolutionStructs.Problem import ProblemType from Tensile.Toolchain.Component import Assembler @@ -1716,7 +1717,7 @@ def depthUIteration( state["_staggerStrideShift"] = (int)(math.ceil(math.log(state["StaggerUStride"] / (state["DepthU"] * bpeA), 2))) - def calcLdsPad(lrvw: int, isaInfoMap: Dict[str, IsaInfo]) -> int: + def calcLdsPad(isaInfoMap: Dict[str, IsaInfo]) -> int: lrvwA = state["LocalReadVectorWidthA"] lrvwB = state["LocalReadVectorWidthB"] ldsPadA = state["LdsPadA"] @@ -2021,7 +2022,7 @@ def calcLdsNumBytes(ldsPadA: int, LdsBlockSizePerPadA: int, ldsPadB: int, LdsBlo wlrB = max(state["LocalReadVectorWidthB"] // state["MIInputPerThread"], 1) if (wlrA > 1) or (wlrB > 1): - padA, padB, padM = calcLdsPad() + padA, padB, padM = calcLdsPad(isaInfoMap) ldsBlockSizePerPadA, ldsBlockSizePerPadB = calcLdsBlockSizePerPad() ldsNumBytesA, ldsNumBytesAlignedA, ldsNumBytesB, ldsNumBytesAlignedB, ldsNumBytesMetadata, ldsNumBytesAlignedMetadata, \ ldsNumBytesMXSA, ldsNumBytesAlignedMXSA, ldsNumBytesMXSB, ldsNumBytesAlignedMXSB = calcLdsNumBytes(padA, ldsBlockSizePerPadA, padB, ldsBlockSizePerPadB) @@ -3073,7 +3074,7 @@ def subCheckLdsBlockSizePerPad(tc, idx): state["NoLdsWriteCode"] = True # calculate ldsPad - state["LdsPadA"], state["LdsPadB"], state["LdsPadMetadata"] = calcLdsPad(state["LocalReadVectorWidth"], isaInfoMap) + state["LdsPadA"], state["LdsPadB"], state["LdsPadMetadata"] = calcLdsPad(isaInfoMap) if state["GlobalReadVectorWidthA"] * state["ProblemType"]["MacDataTypeA"].numBytes() == 32 and state["LdsPadA"] == 16 // state["ProblemType"]["MacDataTypeA"].numBytes(): if auto_LdsBlockSizePerPadA_for_mix: From 5da8bf859b9f38e0596419b7e7a1401ab6a14d1a Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Tue, 17 Feb 2026 16:27:43 -0500 Subject: [PATCH 2/3] Fix MaxLDS source --- .../tensilelite/Tensile/SolutionStructs/Solution.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index ccc39e0eaf8..b6cd010a90c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -38,8 +38,7 @@ roundUpToNearestMultiple from Tensile.Common.DataType import DataType from Tensile.Common.GlobalParameters import defaultSolution, \ - defaultInternalSupportParams, \ - globalParameters + defaultInternalSupportParams from Tensile.SolutionStructs.Naming import getSolutionNameFull from Tensile.SolutionStructs.Problem import ProblemType from Tensile.Toolchain.Component import Assembler @@ -2026,7 +2025,7 @@ def calcLdsNumBytes(ldsPadA: int, LdsBlockSizePerPadA: int, ldsPadB: int, LdsBlo ldsBlockSizePerPadA, ldsBlockSizePerPadB = calcLdsBlockSizePerPad() ldsNumBytesA, ldsNumBytesAlignedA, ldsNumBytesB, ldsNumBytesAlignedB, ldsNumBytesMetadata, ldsNumBytesAlignedMetadata, \ ldsNumBytesMXSA, ldsNumBytesAlignedMXSA, ldsNumBytesMXSB, ldsNumBytesAlignedMXSB = calcLdsNumBytes(padA, ldsBlockSizePerPadA, padB, ldsBlockSizePerPadB) - if (ldsNumBytesAlignedA + ldsNumBytesAlignedB) > globalParameters["MaxLDS"]: + if (ldsNumBytesAlignedA + ldsNumBytesAlignedB) > state["MaxLDS"]: if wlrA > 1: state["LocalReadVectorWidthA"] //= 2 if wlrB > 1: From 980ead5c95a9d4be9ef8cc67ee6000dc46d6e82b Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Tue, 17 Feb 2026 18:58:41 -0600 Subject: [PATCH 3/3] Fix write offsets for DirectToLDS --- .../hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index a4aea132c09..0121c83ad61 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -3294,7 +3294,8 @@ def graFinalOffsetsSingleLoopGNLC(self, kernel, tP, tc, margin = -1): module.add(VAddU32(dst=vgpr(grov), src0=vgpr(tmpv), src1=vgpr(grov), \ comment="final" )) module.add(VLShiftLeftB32(dst=vgpr(grov), shiftHex=log2(tP["bpeGR"]), src=vgpr(grov))) - module.add(VAddU32(dst=vgpr(grov), src0=self.states.srdShiftLeft[tc] * tP["bpeGR"] , src1=vgpr(grov), \ + ptrshift = int(self.states.srdShiftLeft[tc] * tP["bpeGR"]) + module.add(VAddU32(dst=vgpr(grov), src0=ptrshift , src1=vgpr(grov), \ comment="ptr-shift" )) self.vgprPool.checkIn(tmpv) @@ -4257,8 +4258,8 @@ def lwaFirstOffset(self, kernel, tP): dst=sgpr("LocalWriteAddr%s"%tc), \ src=vgpr(tmpv), \ comment="Copy lds write address VGPR to SGPR")) - module.add(SMulI32(dst=sgpr("LocalWriteAddr%s"%tc), src0=sgpr("LocalWriteAddr%s"%tc), \ - src1=((kernel["WavefrontSize"] * kernel["GlobalReadVectorWidth%c"%tc]+kernel["LdsPad%s"%tc]) * tP["bpeGR"]) )) + lwastride = int((kernel["WavefrontSize"] * kernel["GlobalReadVectorWidth%c"%tc]+kernel["LdsPad%s"%tc]) * tP["bpeGR"]) + module.add(SMulI32(dst=sgpr("LocalWriteAddr%s"%tc), src0=sgpr("LocalWriteAddr%s"%tc), src1=lwastride )) if tc == 'B': module.add(SAddU32(dst=sgpr("LocalWriteAddr%s"%tc), src0=sgpr("LocalWriteAddr%s"%tc), \ src1=kernel["LdsOffsetB"] ))