diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index c4de12d1900..0121c83ad61 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -3294,7 +3294,8 @@ def graFinalOffsetsSingleLoopGNLC(self, kernel, tP, tc, margin = -1): module.add(VAddU32(dst=vgpr(grov), src0=vgpr(tmpv), src1=vgpr(grov), \ comment="final" )) module.add(VLShiftLeftB32(dst=vgpr(grov), shiftHex=log2(tP["bpeGR"]), src=vgpr(grov))) - module.add(VAddU32(dst=vgpr(grov), src0=self.states.srdShiftLeft[tc] * tP["bpeGR"] , src1=vgpr(grov), \ + ptrshift = int(self.states.srdShiftLeft[tc] * tP["bpeGR"]) + module.add(VAddU32(dst=vgpr(grov), src0=ptrshift , src1=vgpr(grov), \ comment="ptr-shift" )) self.vgprPool.checkIn(tmpv) @@ -4257,8 +4258,8 @@ def lwaFirstOffset(self, kernel, tP): dst=sgpr("LocalWriteAddr%s"%tc), \ src=vgpr(tmpv), \ comment="Copy lds write address VGPR to SGPR")) - module.add(SMulI32(dst=sgpr("LocalWriteAddr%s"%tc), src0=sgpr("LocalWriteAddr%s"%tc), \ - src1=((kernel["WavefrontSize"] * kernel["GlobalReadVectorWidth%c"%tc]+kernel["LdsPad%s"%tc]) * tP["bpeGR"]) )) + lwastride = int((kernel["WavefrontSize"] * kernel["GlobalReadVectorWidth%c"%tc]+kernel["LdsPad%s"%tc]) * tP["bpeGR"]) + module.add(SMulI32(dst=sgpr("LocalWriteAddr%s"%tc), src0=sgpr("LocalWriteAddr%s"%tc), src1=lwastride )) if tc == 'B': module.add(SAddU32(dst=sgpr("LocalWriteAddr%s"%tc), src0=sgpr("LocalWriteAddr%s"%tc), \ src1=kernel["LdsOffsetB"] )) @@ -5518,8 +5519,8 @@ def generateReLoadLoop(tc): jumpLabel(tP, sLoadTileIdx, checkAddrLabel) imod.add(checkAddrLabel) imod.add(VSubU32(dst=vgpr(tmpVgpr), src0=vgpr(tmpVgpr), - src1=self.states.srdShiftLeft[tc] * tP["bpeGR"], comment="sub prepad")) - loadRangePerThread = tP["glvw"] * tP["bpeGR"] - 1 + src1=int(self.states.srdShiftLeft[tc] * tP["bpeGR"]), comment="sub prepad")) + loadRangePerThread = int(tP["glvw"] * tP["bpeGR"] - 1) imod.add(VAddU32(dst=vgpr(tmpVgpr+1), src0=vgpr(tmpVgpr), src1=loadRangePerThread, \ comment="Calculate load range per thread")) imod.add(VCmpLtI32(dst=sgpr(sCmpLoadStartAddrStatusx2, 2), src0=vgpr(tmpVgpr), \ @@ -8439,7 +8440,7 @@ def globalReadGuardKBody(tP, optParams = None): destVgpr="G2L%s+%u+%u"%(tc, g2lIdx + tP["shiftGR"] if not tP["isM"] else g2lIdxM, regIdx+eccOffset) self.vgprs.globalReadRegisters[tc].append( (g2lIdx + tP["shiftGR"] if not tP["isM"] else g2lIdxM) + regIdx+eccOffset) - offset = r * tP["bpeGR"] + instOffset + offset = int(r * tP["bpeGR"] + instOffset) comment = "load one buffer value" if (dataType.isHalf() or dataType.isBFloat16()) and not tP["isM"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index ae964d8cadf..b6cd010a90c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -1716,7 +1716,7 @@ def depthUIteration( state["_staggerStrideShift"] = (int)(math.ceil(math.log(state["StaggerUStride"] / (state["DepthU"] * bpeA), 2))) - def calcLdsPad(lrvw: int, isaInfoMap: Dict[str, IsaInfo]) -> int: + def calcLdsPad(isaInfoMap: Dict[str, IsaInfo]) -> int: lrvwA = state["LocalReadVectorWidthA"] lrvwB = state["LocalReadVectorWidthB"] ldsPadA = state["LdsPadA"] @@ -2021,11 +2021,11 @@ def calcLdsNumBytes(ldsPadA: int, LdsBlockSizePerPadA: int, ldsPadB: int, LdsBlo wlrB = max(state["LocalReadVectorWidthB"] // state["MIInputPerThread"], 1) if (wlrA > 1) or (wlrB > 1): - padA, padB, padM = calcLdsPad() + padA, padB, padM = calcLdsPad(isaInfoMap) ldsBlockSizePerPadA, ldsBlockSizePerPadB = calcLdsBlockSizePerPad() ldsNumBytesA, ldsNumBytesAlignedA, ldsNumBytesB, ldsNumBytesAlignedB, ldsNumBytesMetadata, ldsNumBytesAlignedMetadata, \ ldsNumBytesMXSA, ldsNumBytesAlignedMXSA, ldsNumBytesMXSB, ldsNumBytesAlignedMXSB = calcLdsNumBytes(padA, ldsBlockSizePerPadA, padB, ldsBlockSizePerPadB) - if (ldsNumBytesAlignedA + ldsNumBytesAlignedB) > globalParameters["MaxLDS"]: + if (ldsNumBytesAlignedA + ldsNumBytesAlignedB) > state["MaxLDS"]: if wlrA > 1: state["LocalReadVectorWidthA"] //= 2 if wlrB > 1: @@ -3073,7 +3073,7 @@ def subCheckLdsBlockSizePerPad(tc, idx): state["NoLdsWriteCode"] = True # calculate ldsPad - state["LdsPadA"], state["LdsPadB"], state["LdsPadMetadata"] = calcLdsPad(state["LocalReadVectorWidth"], isaInfoMap) + state["LdsPadA"], state["LdsPadB"], state["LdsPadMetadata"] = calcLdsPad(isaInfoMap) if state["GlobalReadVectorWidthA"] * state["ProblemType"]["MacDataTypeA"].numBytes() == 32 and state["LdsPadA"] == 16 // state["ProblemType"]["MacDataTypeA"].numBytes(): if auto_LdsBlockSizePerPadA_for_mix: