diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index 5d55cbdebda..255a41aeb16 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -20,15 +20,16 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ################################################################################ +from rocisa.enum import CacheScope from rocisa.code import Module, Label -from rocisa.container import vgpr, sgpr, SMEMModifiers, replaceHolder, EXEC,\ +from rocisa.container import vgpr, sgpr, SMEMModifiers, MUBUFModifiers, replaceHolder, EXEC,\ VOP3PModifiers, ContinuousRegister from rocisa.instruction import SAddCU32, SAddI32, SAddU32, SAndB32, SBarrier, \ SBranch, SCBranchSCC0, SCBranchSCC1, SCMovB32, SCSelectB32, SCmpEQU32, SCmpEQU64, \ SCmpGtU32, SCmpLeU32, SCmpLtU32, SCmpGeU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \ SMinU32, SMovB32, SMovB64, SMulI32, SNop, SSleep, SStoreB32, SSubU32, \ SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \ - VReadfirstlaneB32, VCvtBF16toFP32 + VReadfirstlaneB32, VCvtBF16toFP32, BufferStoreB32 from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \ vectorStaticMultiply, BranchIfNotZero, scalarUInt32DivideAndRemainder @@ -228,7 +229,7 @@ def computeStoreSrdStartCommon(self, writer, kernel): tmpSgpr0 = tmpSgprInfo.idx+1 tmpSgpr1 = tmpSgprInfo.idx+2 tmpSgpr2 = tmpSgprInfo.idx+0 - tmpSgpr3 = tmpSgprInfo.idx+3 + tmpSgpr3 = tmpSgprInfo.idx+3 module.addComment("Split Output Buffer offset: Free0 + (Free1-1)*StrideC1J + (Free2-1)*StrideCK * SplitIdx * bpe%s") # PartialIdx was saved in sgprBeta for re-use module.addModuleAsFlatItems(writer.s_mul_u64_u32(sgpr(tmpSgpr0), sgpr(tmpSgpr1), sgpr("SizesFree+0"), sgpr("SkPartialIdx"), comment="Free0")) @@ -427,7 +428,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sFlagIdx), shiftHex=log2(4), comment="flag offset based on wg index")) module.add(skFixupWaitForFlag) # loop to wait for flag - module.add(SLoadB32(dst=sgpr(tmpSgpr+1), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="get flag")) + module.add(SLoadB32(dst=sgpr(tmpSgpr+1), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag")) module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) if kernel["DebugStreamK"] & 2 == 0: # Don't wait for partials if not being written @@ -489,7 +490,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele # Check flag module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index")) - module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="get flag")) + module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag")) module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) if kernel["DebugStreamK"] & 2 == 0: @@ -502,8 +503,12 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele module.add(VReadfirstlaneB32(dst=sgpr(tmpSgpr+2), src=vgpr("Serial"), comment="Wave 0 updates flags")) module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=0, comment="Check for wave 0")) module.add(SCBranchSCC0(labelName=skipFlagReset.getLabelName(), comment="Skip flag reset")) - # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag - module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag")) + if writer.states.asmCaps["HasScalarStore"]: + # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag + module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag")) + else: + module.add(VMovB32(dst=vgpr(tmpVgpr), src=0, comment="move 0 to tmpVgpr")) + module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="reset flag")) module.add(skipFlagReset) writer.sgprPool.checkIn(tmpSgpr) @@ -806,8 +811,12 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, module.add(VReadfirstlaneB32(dst=sgpr(flagSgpr), src=vgpr("Serial"), comment="Wave 0 updates flags")) module.add(SCmpEQU32(src0=sgpr(flagSgpr), src1=0, comment="Check for wave 0")) module.add(SCBranchSCC0(labelName=skipFlagSet.getLabelName(), comment="Skip flag set")) - module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data")) - module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag")) + if writer.states.asmCaps["HasScalarStore"]: + module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data")) + module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag")) + else: + module.add(VMovB32(dst=vgpr(tmpVgpr), src=1, comment="move 1 to tmpVgpr")) + module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="set flag")) module.add(skipFlagSet) module.add(SWaitCnt(kmcnt=0, comment="wait for flag")) # TODO just for testing @@ -818,6 +827,20 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, return module + def setFlagValue(self, writer, src, soffset, comment=""): + module = Module("Buffer Store Flag Value") + tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False) + module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2))) + module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB")) + module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96")) + module.add(BufferStoreB32(src=src, vaddr=vgpr("off", isOff=True), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \ + mubuf=MUBUFModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \ + comment=comment)) + module.add(SWaitCnt(vscnt=0, comment="wait for data store")) #TODO: See if this wait is necessery + writer.sgprPool.checkIn(tmpSgprBuffer) + + return module + def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edge, gwvw, atomicW, \ batchElements, addrD, addrC, \ tmpVgpr, cvtVgprStruct, batchElementSgprs, tmpSgpr, codeAccVgprRead): @@ -1753,6 +1776,12 @@ def preLoop(self, writer, kernel): xccMapping = Component.XCCMapping.find(writer) module.add(xccMapping(writer, kernel)) + # Workaround for gfx12 + if writer.states.archCaps["WorkGroupIdFromTTM"]: + module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround")) + module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround")) + module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround")) + module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index")) # Basic SK module.add(SMulI32(dst=sgpr("StreamKIter"), src0=sgpr("StreamKIdx"), src1=sgpr("SKItersPerWG"), comment="StreamK starting iteration")) @@ -1831,6 +1860,12 @@ def preLoop(self, writer, kernel): xccMapping = Component.XCCMapping.find(writer) module.add(xccMapping(writer, kernel)) + # Workaround for gfx12 + if writer.states.archCaps["WorkGroupIdFromTTM"]: + module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround")) + module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround")) + module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround")) + module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index")) # Two-tile SK (SK first) # iter count after all extra iters have been distributed @@ -1950,6 +1985,12 @@ def preLoop(self, writer, kernel): xccMapping = Component.XCCMapping.find(writer) module.add(xccMapping(writer, kernel)) + # Workaround for gfx12 + if writer.states.archCaps["WorkGroupIdFromTTM"]: + module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround")) + module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround")) + module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround")) + module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index")) # Two-tile SK (DP first) # Do DP tiles before SK @@ -1979,13 +2020,13 @@ def preLoop(self, writer, kernel): # if (partialIdx < extraIters) then (skIter = partialIdx * (itersPerWG + 1)) else (skIter = partialIdx * itersPerWG + extraIters) skHasExtraLabel = Label("SK_HasExtra", "") skDoneExtraLabel = Label("SK_DoneExtra", "") - + # PartialIdx = itersPerTile % skSplit (skSplit is passed as SkSplit) # extraIters = ItersPerTile - SkSplit * skItersPerWG sSkExtraIters = writer.sgprPool.checkOut(1, "extraIters") module.add(SMulI32(dst=sgpr(sSkExtraIters), src0=sgpr("SkSplit"), src1=sgpr("SKItersPerWG"))) module.add(SSubU32(dst=sgpr(sSkExtraIters), src0=sgpr("ItersPerTile"), src1=sgpr(sSkExtraIters), comment="extraIters = itersPerTile - SkSplit * skItersPerWG")) - + module.add(SMulI32(dst=sgpr("StreamKIter"), src0=sgpr(stmpPartialIdx), src1=sgpr("SKItersPerWG"), comment="StreamK starting iteration (case: after extra iters)")) module.add(SCmpLtU32(src0=sgpr(stmpPartialIdx), src1=sgpr(sSkExtraIters), comment="Check if WG gets an extra iteration")) module.add(SCBranchSCC1(labelName=skHasExtraLabel.getLabelName(), comment="Has extra iter")) @@ -2007,7 +2048,7 @@ def preLoop(self, writer, kernel): module.add(SMovB32(dst=sgpr("SkPartialIdx"), src=sgpr(stmpPartialIdx), comment="Save partial idx for SrdD calculation")) # Done init module.add(SBranch(labelName=skInitDone.getLabelName(), comment="Done init for parallel reduction")) - + # # Save PratialIdx for later, skExtraIters is unused for partial reduction # module.add(SMovB32(dst=sgpr("skExtraIters"), src=sgpr(stmpPartialIdx), comment="Save partial idx for SrdD calculation")) # # StreamKIter = tile * itersPerTile + itersPerWG * partialIndex diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index fef11d022f1..9fd8cc1c730 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -38,7 +38,7 @@ scalarStaticRemainder, scalarUInt32DivideAndRemainder, sMagicDiv, vectorStaticMultiply, \ vectorStaticMultiplyAdd, scalarStaticMultiply64, BranchIfZero, BranchIfNotZero, DSInit, \ ArgumentLoader -from rocisa.enum import InstType, SelectBit +from rocisa.enum import InstType, SelectBit, CacheScope from rocisa.macro import MacroVMagicDiv, PseudoRandomGenerator from . import CUSTOM_KERNEL_PATH from rocisa.instruction import BranchInstruction, BufferLoadB128, BufferLoadB32, \ @@ -12295,7 +12295,9 @@ def bufferLoadImpl(soffset): ############################################################################## def chooseGlobalWrite(self, useBuffer, bps, srcVgpr, rpv, \ addr0, addr1, offset, soffset=0, \ - glc=False, slc=False, nt=False, hi16=0, comment="store"): + glc=False, slc=False, nt=False, dlc=False, \ + scope=CacheScope.SCOPE_NONE, \ + hi16=0, comment="store"): """ create the store instruction for requested vector width and other parms rpv = regs per vector @@ -12338,7 +12340,7 @@ def bufferStoreImpl(tmpSgpr, mubuf): if offset2 >= 4096: module.add(SMovB32(dst=tmpSgpr, src=offset2, comment="large offset")) offset2 = 0 - mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, nt=nt, isStore=True) + mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True) vgprOff = int(srcVgpr + shiftRpv * i) if isinstance(srcVgpr, int) else f"{srcVgpr}+{int(shiftRpv * i)}" module.add(BufferStoreB128(src=vgpr(vgprOff, shiftRpv), vaddr=addr0, \ saddr=addr1, soffset=tmpSgpr, mubuf=mubuf2, comment=comment)) @@ -12346,7 +12348,7 @@ def bufferStoreImpl(tmpSgpr, mubuf): assert 0, "bad bps" if useBuffer: - mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, nt=nt, isStore=True) + mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True) if soffset != 0: assert offset < 4096, "sgpr offset provided with large const offset" # buffer_load offset field is 12-bit. @@ -12357,13 +12359,13 @@ def bufferStoreImpl(tmpSgpr, mubuf): tmpSgpr = sgpr(tmpSgprInfo.idx) if offset >= 4096: module.add(SMovB32(dst=tmpSgpr, src=offset, comment="large offset")) - mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, nt=nt, isStore=True) + mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True) bufferStoreImpl(tmpSgpr, mubuf) else: bufferStoreImpl(soffset, mubuf) else: - flat = FLATModifiers(glc=glc, slc=slc, isStore=True) + flat = FLATModifiers(glc=glc, slc=slc, dlc=dlc, scope=scope, isStore=True) if bps==2 and hi16: module.add(FlatStoreD16HIB16(vaddr=addr0, src=vgpr(srcVgpr*2), flat=flat, comment=comment)) elif bps==2 and not hi16: @@ -12493,6 +12495,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset isGlc = False isSlc = False isNT = False + scope = CacheScope.SCOPE_NONE + isDlc = False if tc == 'D': isGlc = bool(kernel["NonTemporalD"] & 0x1) @@ -12533,6 +12537,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset isGlc = True isSlc = True isNT = bool(kernel["NonTemporalD"] & 0x4) + isDlc = True + scope = CacheScope.SCOPE_DEV bps = self.states.bpeCinternal * ss.cfg.gwvw rpv = self.states.bpeCinternal * ss.cfg.gwvw / self.states.bpr @@ -12582,36 +12588,36 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset if self.states.asmCaps["HasWMMA_V1"] and kernel["EnableMatrixInstruction"]: module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment)) else: module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx//2, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, hi16=sumIdx%2, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=sumIdx%2, comment=comment)) else: # (B,B,B,B,S,S), internal S # (H,H,H,H,H,H), internal S # (H,H,H,H,S,S), internal S module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment)) elif dataType.isInt32() or dataType.isSingle(): module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) elif dataType.isDouble() or dataType.isSingleComplex(): module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*2, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) elif dataType.isDoubleComplex(): rps = dataType.numRegisters() module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*rps, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) elif dataType.isInt8() or dataType.isAnyFloat8() or dataType.isAnyBFloat8() or dataType.isAnyFloat8BFloat8() or dataType.isAnyBFloat8Float8(): if kernel["ProblemType"]["HighPrecisionAccumulate"]: module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) return module ############################################################################## diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index b7162c7f21b..f7bd5ad5353 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -980,8 +980,6 @@ def assignDerivedParameters( state["GlobalSplitUAlgorithm"] = "MultipleBuffer" # Set default Algorithm if not state["EnableMatrixInstruction"]: reject(state, printRejectionReason, "Stream-K requires MatrixInstruction") - if isaInfoMap[isa].asmCaps["HasWMMA"]: - reject(state, printRejectionReason, "Stream-K untested with WMMA") # if state["PersistentKernel"]: # reject(state, printRejectionReason, "Cannot enable both Stream-K and PersistentKernel") if not state["ProblemType"]["StridedBatched"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml new file mode 100644 index 00000000000..d949ba5f633 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml @@ -0,0 +1,456 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml new file mode 100644 index 00000000000..675d87bde37 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml @@ -0,0 +1,456 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml new file mode 100644 index 00000000000..da6174efcb8 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml @@ -0,0 +1,420 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] \ No newline at end of file diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml new file mode 100644 index 00000000000..a483f8ab460 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml @@ -0,0 +1,420 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - StreamKFixupTreeReduction: [0] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] \ No newline at end of file diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp index a73f04c1ec9..c5e916130e6 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/base.hpp @@ -270,6 +270,6 @@ namespace rocisa std::string isaToGfx(const nb::tuple& arch); std::string isaToGfx(const IsaVersion& arch); - std::string getGlcBitName(bool hasGLCModifier); - std::string getSlcBitName(bool hasGLCModifier); + std::string getGlcBitName(); + std::string getSlcBitName(); } // namespace rocisa diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp index 8079d5f1098..693c925339a 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp @@ -102,15 +102,19 @@ namespace rocisa struct FLATModifiers : public Container { - FLATModifiers(int offset12 = 0, - bool glc = false, - bool slc = false, - bool lds = false, - bool isStore = false) + FLATModifiers(int offset12 = 0, + bool glc = false, + bool slc = false, + bool dlc = false, + CacheScope scope = CacheScope::SCOPE_NONE, + bool lds = false, + bool isStore = false) : Container() , offset12(offset12) , glc(glc) , slc(slc) + , dlc(dlc) + , scope(scope) , lds(lds) , isStore(isStore) { @@ -121,6 +125,8 @@ namespace rocisa , offset12(other.offset12) , glc(other.glc) , slc(other.slc) + , dlc(other.dlc) + , scope(other.scope) , lds(other.lds) , isStore(other.isStore) { @@ -133,7 +139,8 @@ namespace rocisa std::string toString() const override { - auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; + auto hasDLCModifier = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"]; + auto hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"]; std::string kStr; if(offset12 != 0) { @@ -141,11 +148,19 @@ namespace rocisa } if(glc) { - kStr += " " + getGlcBitName(hasGLCModifier); + kStr += " " + getGlcBitName(); } if(slc) { - kStr += " " + getSlcBitName(hasGLCModifier); + kStr += " " + getSlcBitName(); + } + if(hasDLCModifier && dlc) + { + kStr += " dlc"; + } + if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE) + { + kStr += " scope:" + ::rocisa::toString(scope); } if(lds) { @@ -154,11 +169,13 @@ namespace rocisa return kStr; } - int offset12; - bool glc; - bool slc; - bool lds; - bool isStore; + int offset12; + bool glc; + bool slc; + bool dlc; + CacheScope scope; + bool lds; + bool isStore; }; struct GLOBALModifiers : public Container @@ -190,23 +207,27 @@ namespace rocisa return kStr; } - int offset; + int offset; }; struct MUBUFModifiers : public Container { - MUBUFModifiers(bool offen = false, - int offset12 = 0, - bool glc = false, - bool slc = false, - bool nt = false, - bool lds = false, - bool isStore = false) + MUBUFModifiers(bool offen = false, + int offset12 = 0, + bool glc = false, + bool slc = false, + bool dlc = false, + CacheScope scope = CacheScope::SCOPE_NONE, + bool nt = false, + bool lds = false, + bool isStore = false) : Container() , offen(offen) , offset12(offset12) , glc(glc) , slc(slc) + , dlc(dlc) + , scope(scope) , nt(nt) , lds(lds) , isStore(isStore) @@ -219,6 +240,8 @@ namespace rocisa , offset12(other.offset12) , glc(other.glc) , slc(other.slc) + , dlc(other.dlc) + , scope(other.scope) , nt(other.nt) , lds(other.lds) , isStore(other.isStore) @@ -232,25 +255,29 @@ namespace rocisa std::string toString() const override { - auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; - auto hasSLCModifier = rocIsa::getInstance().getAsmCaps()["HasSLCModifier"]; - auto hasNTModifier = rocIsa::getInstance().getAsmCaps()["HasNTModifier"]; + auto hasDLCModifier = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"]; + auto hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"]; + auto hasNTModifier = rocIsa::getInstance().getAsmCaps()["HasNTModifier"]; std::string kStr; if(offen) { kStr += " offen offset:" + std::to_string(offset12); } - if(glc || slc || lds) - { - kStr += ","; - } if(glc) { - kStr += " " + getGlcBitName(hasGLCModifier); + kStr += " " + getGlcBitName(); } if(slc) { - kStr += " " + getSlcBitName(hasGLCModifier); + kStr += " " + getSlcBitName(); + } + if(hasDLCModifier && dlc) + { + kStr += " dlc"; + } + if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE) + { + kStr += " scope:" + ::rocisa::toString(scope); } if(hasNTModifier && nt) { @@ -263,20 +290,28 @@ namespace rocisa return kStr; } - bool offen; - int offset12; - bool glc; - bool slc; - bool nt; - bool lds; - bool isStore; + bool offen; + int offset12; + bool glc; + bool slc; + bool dlc; + CacheScope scope; + bool nt; + bool lds; + bool isStore; }; struct SMEMModifiers : public Container { - SMEMModifiers(bool glc = false, bool nv = false, int offset = 0) + SMEMModifiers(bool glc = false, + bool dlc = false, + CacheScope scope = CacheScope::SCOPE_NONE, + bool nv = false, + int offset = 0) : Container() , glc(glc) + , dlc(dlc) + , scope(scope) , nv(nv) , offset(offset) // 20u 21s shaes the same { @@ -285,6 +320,8 @@ namespace rocisa SMEMModifiers(const SMEMModifiers& other) : Container() , glc(other.glc) + , dlc(other.dlc) + , scope(other.scope) , nv(other.nv) , offset(other.offset) { @@ -297,15 +334,25 @@ namespace rocisa std::string toString() const override { + auto hasDLCModifier = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"]; + auto hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"]; std::string kStr; if(offset != 0) { kStr += " offset:" + std::to_string(offset); } - if(glc) + if(!hasSCOPEModifier && glc) { kStr += " glc"; } + if(hasDLCModifier && dlc) + { + kStr += " dlc"; + } + if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE) + { + kStr += " scope:" + ::rocisa::toString(scope); + } if(nv) { kStr += " nv"; @@ -313,9 +360,11 @@ namespace rocisa return kStr; } - bool glc; - bool nv; - int offset; + bool glc; + bool dlc; + CacheScope scope; + bool nv; + int offset; }; struct SDWAModifiers : public Container @@ -651,6 +700,7 @@ namespace rocisa bool isMinus; bool isAbs; bool isMacro; + bool isOff; RegisterContainer(const std::string& regType, const std::optional& regName, @@ -665,6 +715,7 @@ namespace rocisa , isMinus(false) , isAbs(false) , isMacro(false) + , isOff(false) { } @@ -672,6 +723,7 @@ namespace rocisa const std::optional& regName, bool isAbs, bool isMacro, + bool isOff, int regIdx = 0, float regNum = 1) : Container() @@ -683,6 +735,7 @@ namespace rocisa , isMinus(false) , isAbs(isAbs) , isMacro(isMacro) + , isOff(isOff) { } @@ -696,6 +749,7 @@ namespace rocisa , isMinus(other.isMinus) , isAbs(other.isAbs) , isMacro(other.isMacro) + , isOff(other.isOff) { } @@ -719,6 +773,7 @@ namespace rocisa , isMinus(other.isMinus) , isAbs(other.isAbs) , isMacro(other.isMacro) + , isOff(other.isOff) { } @@ -734,6 +789,7 @@ namespace rocisa isMinus = other.isMinus; isAbs = other.isAbs; isMacro = other.isMacro; + isOff = other.isOff; } return *this; } @@ -750,6 +806,7 @@ namespace rocisa isMinus = other.isMinus; isAbs = other.isAbs; isMacro = other.isMacro; + isOff = other.isOff; } return *this; } @@ -861,6 +918,11 @@ namespace rocisa std::string toString() const override { + if(isOff) + { + return "off"; + } + std::string minusStr = isMinus ? "-" : ""; minusStr = isAbs ? "abs(" + minusStr : minusStr; auto absStr = isAbs ? ")" : ""; @@ -1122,8 +1184,11 @@ namespace rocisa // Overloaded functions to create specific GPR containers with default regNum = 1.f std::shared_ptr vgpr(const Holder& holder, float regNum = 1.f); std::shared_ptr vgpr(int idx, float regNum = 1.f); - std::shared_ptr - vgpr(const std::string& name, float regNum = 1.f, bool isMacro = false, bool isAbs = false); + std::shared_ptr vgpr(const std::string& name, + float regNum = 1.f, + bool isMacro = false, + bool isAbs = false, + bool isOff = false); std::shared_ptr sgpr(const Holder& holder, float regNum = 1.f); std::shared_ptr sgpr(int idx, float regNum = 1.f); std::shared_ptr diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp index 2bbc4fdb2bf..e5d6ee55a54 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp @@ -224,6 +224,15 @@ namespace rocisa UNUSED_PRESERVE = 3 }; + enum class CacheScope : int + { + SCOPE_NONE = 0, + SCOPE_CU = 1, + SCOPE_SE = 2, + SCOPE_DEV = 3, + SCOPE_SYS = 4, + }; + enum class CvtType : int { CVT_F16_to_F32 = 1, @@ -298,6 +307,23 @@ namespace rocisa } } + inline std::string toString(CacheScope scope) + { + switch(scope) + { + case CacheScope::SCOPE_CU: + return "SCOPE_CU"; + case CacheScope::SCOPE_SE: + return "SCOPE_SE"; + case CacheScope::SCOPE_DEV: + return "SCOPE_DEV"; + case CacheScope::SCOPE_SYS: + return "SCOPE_SYS"; + default: + return ""; + } + } + enum class SaturateCastType : int { NORMAL = 1, diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp index 7e0512ee608..1452a84e91a 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp @@ -69,16 +69,17 @@ inline bool tryAssembler(const IsaVersion& isaVersion, return true; } -inline int getMaxCnt(const IsaVersion& isaVersion, +inline int getMaxCnt(const IsaVersion& isaVersion, const std::string& assemblerPath, const std::string& prefix, const std::string& suffix, - bool isDebug) + bool isDebug) { for(int p = 64; p > 1; p >>= 1) { // Try ( pow(2) - 1 ) from high to low - if(tryAssembler(isaVersion, assemblerPath, prefix + std::to_string(p - 1) + suffix, isDebug)) + if(tryAssembler( + isaVersion, assemblerPath, prefix + std::to_string(p - 1) + suffix, isDebug)) return p - 1; } return 0; @@ -117,7 +118,9 @@ inline std::map = tryAssembler(isaVersion, assemblerPath, "v_lshl_or_b32 v47, v36, 0x2, v34", isDebug); rv["HasSMulHi"] = tryAssembler(isaVersion, assemblerPath, "s_mul_hi_u32 s47, s36, s34", isDebug); - + rv["HasScalarStore"] + = tryAssembler(isaVersion, assemblerPath, "s_store_dword s79, s[70:71], s77", isDebug) + || tryAssembler(isaVersion, assemblerPath, "s_store_b32 s79, s[70:71], s77", isDebug); rv["HasMFMA_explictB"] = tryAssembler( isaVersion, assemblerPath, "v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]", isDebug); rv["HasMFMA"] = tryAssembler(isaVersion, @@ -276,6 +279,35 @@ inline std::map assemblerPath, "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, glc", isDebug); + rv["HasSC0Modifier"] + = tryAssembler(isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, sc0", + isDebug) + || tryAssembler(isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, sc0", + isDebug); + rv["HasDLCModifier"] + = tryAssembler(isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, dlc", + isDebug) + || tryAssembler(isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, dlc", + isDebug); + rv["HasSCOPEModifier"] + = tryAssembler( + isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, scope:SCOPE_DEV", + isDebug) + || tryAssembler( + isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, scope:SCOPE_DEV", + isDebug); rv["HasMUBUFConst"] = tryAssembler(isaVersion, assemblerPath, "buffer_load_dword v40, v36, s[24:27], 1 offen offset:0", @@ -303,18 +335,19 @@ inline std::map rv["s_delay_alu"] = tryAssembler(isaVersion, assemblerPath, "s_delay_alu instid0(VALU_DEP_1)", isDebug); - rv["SeparateVscnt"] = tryAssembler(isaVersion, assemblerPath, "s_waitcnt_vscnt null 0", isDebug); + rv["SeparateVscnt"] + = tryAssembler(isaVersion, assemblerPath, "s_waitcnt_vscnt null 0", isDebug); rv["SeparateLGKMcnt"] = tryAssembler(isaVersion, assemblerPath, "s_wait_dscnt 0", isDebug) - && tryAssembler(isaVersion, assemblerPath, "s_wait_kmcnt 0", isDebug); + && tryAssembler(isaVersion, assemblerPath, "s_wait_kmcnt 0", isDebug); rv["SeparateVMcnt"] = tryAssembler(isaVersion, assemblerPath, "s_wait_loadcnt 0", isDebug) - && tryAssembler(isaVersion, assemblerPath, "s_wait_storecnt 0", isDebug); + && tryAssembler(isaVersion, assemblerPath, "s_wait_storecnt 0", isDebug); if(rv["SeparateVMcnt"]) { // s_wait_loadcnt accept 16 bits immediate, but only use the lowest 6 bits are used, can't use tryAssembler - rv["MaxLoadcnt"] = 63; + rv["MaxLoadcnt"] = 63; // s_wait_storecnt accept 16 bits immediate, but only use the lowest 6 bits are used, can't use tryAssembler rv["MaxStorecnt"] = 63; } diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp index cf0446b8f48..a24dbe27b12 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp @@ -831,7 +831,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -855,7 +855,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -879,7 +879,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -903,7 +903,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -1210,7 +1210,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment) { } @@ -1234,7 +1234,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment) { } @@ -1258,7 +1258,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment) { } @@ -1282,7 +1282,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment) { } diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp index b4b8b9fe829..b2a1120c73f 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/base.cpp @@ -37,18 +37,26 @@ namespace rocisa return getGfxNameTuple(arch); } - std::string getGlcBitName(bool hasGLCModifier) + std::string getGlcBitName() { + auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; + auto hasSC0Modifier = rocIsa::getInstance().getAsmCaps()["HasSC0Modifier"]; if(hasGLCModifier) return "glc"; - return "sc0"; + if(hasSC0Modifier) + return "sc0"; + return ""; } - std::string getSlcBitName(bool hasGLCModifier) + std::string getSlcBitName() { + auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; + auto hasSC0Modifier = rocIsa::getInstance().getAsmCaps()["HasSC0Modifier"]; if(hasGLCModifier) return "slc"; - return "sc1"; + if(hasSC0Modifier) + return "sc1"; + return ""; } // Force init the instance diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp index a9384c22744..cf990539973 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp @@ -105,10 +105,12 @@ namespace rocisa const std::string& name, float regNum = 1.f, bool isMacro = false, - bool isAbs = false) + bool isAbs = false, + bool isOff = false) { RegName regname = generateRegName(name); - return std::make_shared(gprType, regname, isAbs, isMacro, -1, regNum); + return std::make_shared( + gprType, regname, isAbs, isMacro, isOff, -1, regNum); } // Overloaded functions to create specific GPR containers with default regNum = 1.f @@ -123,9 +125,9 @@ namespace rocisa } std::shared_ptr - vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs) + vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs, bool isOff) { - return createGPR("v", name, regNum, isMacro, isAbs); + return createGPR("v", name, regNum, isMacro, isAbs, isOff); } std::shared_ptr sgpr(const Holder& holder, float regNum) @@ -187,11 +189,12 @@ void init_containers(nb::module_ m) nb::arg("idx"), nb::arg("regNum") = 1.f); m_con.def("vgpr", - nb::overload_cast(&rocisa::vgpr), + nb::overload_cast(&rocisa::vgpr), nb::arg("name"), nb::arg("regNum") = 1.f, nb::arg("isMacro") = false, - nb::arg("isAbs") = false); + nb::arg("isAbs") = false, + nb::arg("isOff") = false); m_con.def("sgpr", nb::overload_cast(&rocisa::sgpr), @@ -276,10 +279,12 @@ void init_containers(nb::module_ m) std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t)); }); nb::class_(m_con, "FLATModifiers") - .def(nb::init(), + .def(nb::init(), nb::arg("offset12") = 0, nb::arg("glc") = false, nb::arg("slc") = false, + nb::arg("dlc") = false, + nb::arg("scope") = 0, nb::arg("lds") = false, nb::arg("isStore") = false) .def_rw("isStore", &rocisa::FLATModifiers::isStore) @@ -290,14 +295,25 @@ void init_containers(nb::module_ m) }) .def("__getstate__", [](const rocisa::FLATModifiers& self) { - return std::make_tuple(self.offset12, self.glc, self.slc, self.lds, self.isStore); + return std::make_tuple(self.offset12, + self.glc, + self.slc, + self.dlc, + self.scope, + self.lds, + self.isStore); }) - .def( - "__setstate__", - [](rocisa::FLATModifiers& self, std::tuple t) { - new(&self) rocisa::FLATModifiers( - std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t)); - }); + .def("__setstate__", + [](rocisa::FLATModifiers& self, + std::tuple t) { + new(&self) rocisa::FLATModifiers(std::get<0>(t), + std::get<1>(t), + std::get<2>(t), + std::get<3>(t), + std::get<4>(t), + std::get<5>(t), + std::get<6>(t)); + }); nb::class_(m_con, "GLOBALModifiers") .def(nb::init(), @@ -318,11 +334,13 @@ void init_containers(nb::module_ m) }); nb::class_(m_con, "MUBUFModifiers") - .def(nb::init(), + .def(nb::init(), nb::arg("offen") = false, nb::arg("offset12") = 0, nb::arg("glc") = false, nb::arg("slc") = false, + nb::arg("dlc") = false, + nb::arg("scope") = 0, nb::arg("nt") = false, nb::arg("lds") = false, nb::arg("isStore") = false) @@ -332,27 +350,37 @@ void init_containers(nb::module_ m) [](const rocisa::MUBUFModifiers& self, nb::dict&) { return rocisa::MUBUFModifiers(self); }) - .def( - "__getstate__", - [](const rocisa::MUBUFModifiers& self) { - return std::make_tuple( - self.offen, self.offset12, self.glc, self.slc, self.nt, self.lds, self.isStore); - }) + .def("__getstate__", + [](const rocisa::MUBUFModifiers& self) { + return std::make_tuple(self.offen, + self.offset12, + self.glc, + self.slc, + self.dlc, + self.scope, + self.nt, + self.lds, + self.isStore); + }) .def("__setstate__", - [](rocisa::MUBUFModifiers& self, - std::tuple t) { + [](rocisa::MUBUFModifiers& self, + std::tuple t) { new(&self) rocisa::MUBUFModifiers(std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t), std::get<5>(t), - std::get<6>(t)); + std::get<6>(t), + std::get<7>(t), + std::get<8>(t)); }); nb::class_(m_con, "SMEMModifiers") - .def(nb::init(), + .def(nb::init(), nb::arg("glc") = false, + nb::arg("dlc") = false, + nb::arg("scope") = 0, nb::arg("nv") = false, nb::arg("offset") = 0) .def("__str__", &rocisa::SMEMModifiers::toString) @@ -362,11 +390,15 @@ void init_containers(nb::module_ m) }) .def("__getstate__", [](const rocisa::SMEMModifiers& self) { - return std::make_tuple(self.glc, self.nv, self.offset); + return std::make_tuple(self.glc, self.dlc, self.scope, self.nv, self.offset); }) - .def("__setstate__", [](rocisa::SMEMModifiers& self, std::tuple t) { - new(&self) rocisa::SMEMModifiers(std::get<0>(t), std::get<1>(t), std::get<2>(t)); - }); + .def( + "__setstate__", + [](rocisa::SMEMModifiers& self, + std::tuple t) { + new(&self) rocisa::SMEMModifiers( + std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t)); + }); nb::class_(m_con, "SDWAModifiers") .def(nb::init(), diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp index 287771b3350..ecac17cd9b9 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp @@ -130,6 +130,14 @@ void init_enum(nb::module_ m) .value("UNUSED_PRESERVE", rocisa::UnusedBit::UNUSED_PRESERVE) .export_values(); + nb::enum_(m_enum, "CacheScope") + .value("SCOPE_NONE", rocisa::CacheScope::SCOPE_NONE) + .value("SCOPE_CU", rocisa::CacheScope::SCOPE_CU) + .value("SCOPE_SE", rocisa::CacheScope::SCOPE_SE) + .value("SCOPE_DEV", rocisa::CacheScope::SCOPE_DEV) + .value("SCOPE_SYS", rocisa::CacheScope::SCOPE_SYS) + .export_values(); + nb::enum_(m_enum, "CvtType") .value("CVT_F16_to_F32", rocisa::CvtType::CVT_F16_to_F32) .value("CVT_F32_to_F16", rocisa::CvtType::CVT_F32_to_F16) diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_base.py b/projects/hipblaslt/tensilelite/rocisa/test/test_base.py index e461d24e866..e675fd42976 100644 --- a/projects/hipblaslt/tensilelite/rocisa/test/test_base.py +++ b/projects/hipblaslt/tensilelite/rocisa/test/test_base.py @@ -111,8 +111,8 @@ def test_copy(): print("This is a deepcopied function:", deepcopiedFunction(isa)) def test_functions(): - print("GLC:", rocisa.getGlcBitName(True)) - print("SLC:", rocisa.getSlcBitName(False)) + print("GLC:", rocisa.getGlcBitName()) + print("SLC:", rocisa.getSlcBitName()) test_rocisa() test_item() diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py index c550419085a..9fcb7ce6d78 100644 --- a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py +++ b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py @@ -41,15 +41,23 @@ def test_containers(): assert str(ds_modifiers) == " offset:2 gds" # Test FLATModifiers - flat_modifiers = rocisa.container.FLATModifiers(8, True, False, True, False) + flat_modifiers = rocisa.container.FLATModifiers( + offset12=8, glc=True, slc=False, dlc=False, + scope=rocisa.enum.CacheScope.SCOPE_NONE, + lds=True, isStore=False) assert str(flat_modifiers) == " offset:8 glc lds" # Test MUBUFModifiers - mubuf_modifiers = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True) - assert str(mubuf_modifiers) == " offen offset:12, glc" + mubuf_modifiers = rocisa.container.MUBUFModifiers( + offen=True, offset12=12, glc=True, slc=False, + dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nt=True, lds=False, isStore=True) + assert str(mubuf_modifiers) == " offen offset:12 glc" # Test SMEMModifiers - smem_modifiers = rocisa.container.SMEMModifiers(True, False, 8) + smem_modifiers = rocisa.container.SMEMModifiers( + glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nv=False, offset=8) assert str(smem_modifiers) == " offset:8 glc" # Test SDWAModifiers @@ -104,7 +112,7 @@ def test_containers(): # Test Holder from rocisa.container import vgpr, sgpr, accvgpr, mgpr, Holder - holder = Holder("holder"); + holder = Holder("holder") assert holder.idx == -1 assert str(holder.name) == "holder" assert isinstance(holder, Holder) @@ -120,6 +128,8 @@ def test_containers(): assert str(testGpr) == "acc4" testGpr = mgpr(holder) assert str(testGpr) == "m[mgprholder]" + testGpr = vgpr("off", isOff=True) + assert str(testGpr) == "off" def test_containers_copy(): def copy_test(name, obj): @@ -133,9 +143,14 @@ def copy_test(name, obj): copy_test("DSModifiers", ds) flat = rocisa.container.FLATModifiers(lds=True) copy_test("FLATModifiers", flat) - mubuf = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True) + mubuf = rocisa.container.MUBUFModifiers( + offen=True, offset12=12, glc=True, slc=False, + dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nt=True, lds=False, isStore=True) copy_test("MUBUFModifiers", mubuf) - smem = rocisa.container.SMEMModifiers(True, False, 8) + smem = rocisa.container.SMEMModifiers( + glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nv=False, offset=8) copy_test("SMEMModifiers", smem) sdwa = rocisa.container.SDWAModifiers( dst_sel=rocisa.enum.SelectBit.WORD_0, diff --git a/shared/origami/include/origami/hardware.hpp b/shared/origami/include/origami/hardware.hpp index 2d81538a615..4648cfb5315 100644 --- a/shared/origami/include/origami/hardware.hpp +++ b/shared/origami/include/origami/hardware.hpp @@ -23,6 +23,7 @@ namespace origami Int32, BFloat16, Int8, + Int4, Int64, XFloat32, Float8_fnuz, @@ -67,6 +68,8 @@ namespace origami return 16; case data_type_t::Int8: return 8; + case data_type_t::Int4: + return 4; case data_type_t::Int64: return 64; case data_type_t::XFloat32: @@ -120,6 +123,8 @@ namespace origami return "BFloat16"; case data_type_t::Int8: return "Int8"; + case data_type_t::Int4: + return "Int4"; case data_type_t::Int64: return "Int64"; case data_type_t::XFloat32: @@ -170,6 +175,8 @@ namespace origami return data_type_t::BFloat16; if (s == "i8") return data_type_t::Int8; + if (s == "i4") + return data_type_t::Int4; if (s == "xf32") return data_type_t::XFloat32; if (s == "f8") @@ -259,6 +266,8 @@ namespace origami gfx90a, gfx942, gfx950, + gfx1201, + gfx1100, Count }; @@ -267,7 +276,9 @@ namespace origami static const std::unordered_map str_to_enum_map = {{"gfx90a", architecture_t::gfx90a}, {"gfx942", architecture_t::gfx942}, - {"gfx950", architecture_t::gfx950}}; + {"gfx950", architecture_t::gfx950}, + {"gfx1201", architecture_t::gfx1201}, + {"gfx1100", architecture_t::gfx1100}}; auto it = str_to_enum_map.find(str); if(it != str_to_enum_map.end()) @@ -319,7 +330,13 @@ namespace origami // hardware_t::architecture_constants( // 8, 17, 1.21875121875121875122 * 7, 6, 4, std::make_tuple(-0.000013, 0.007070, 0.027355), 1.5)}}; hardware_t::architecture_constants( - 8, 17, 1.21875121875121875122 * 7, 6, 4, std::make_tuple(0, 0.008, 0), 1.5)}}; + 8, 17, 1.21875121875121875122 * 7, 6, 4, std::make_tuple(0, 0.008, 0), 1.5)}, + {hardware_t::architecture_t::gfx1201, + hardware_t::architecture_constants( + 1, 5.74, 1.21875121875121875122 * 2.41, 0.464, 2, std::make_tuple(0, 0.17, 0), 1.5)}, + {hardware_t::architecture_t::gfx1100, + hardware_t::architecture_constants( + 1, 7.12, 1.21875121875121875122 * 3.48, 0.732, 2, std::make_tuple(0, 0.11, 0), 1.5)}}; inline static const std::unordered_map> INSTRUCTION_MAP @@ -473,6 +490,37 @@ namespace origami // DOT2 {matrix_instruction( 1, 1, 64, data_type_t::Half), 16}, // V_DOT2_F32_F16 {matrix_instruction( 1, 1, 64, data_type_t::BFloat16), 16}, // V_DOT2_F32_BF16 + }}, + {hardware_t::architecture_t::gfx1201, + { + // F16 + {matrix_instruction(16, 16, 16, data_type_t::Half), 16}, // v_wmma_f16_16x16x16_f16/v_wmma_f32_16x16x16_f16 + // BF16 + {matrix_instruction(16, 16, 16, data_type_t::BFloat16), 16}, // v_wmma_bf16_16x16x16_bf16/v_wmma_f32_16x16x16_bf16 + // F8 + {matrix_instruction(16, 16, 16, data_type_t::Float8), 8}, // v_wmma_f32_16x16x16_fp8_fp8 + // F8B8 + {matrix_instruction(16, 16, 16, data_type_t::Float8BFloat8), 8}, // v_wmma_f32_16x16x16_fp8_bf8 + // B8F8 + {matrix_instruction(16, 16, 16, data_type_t::BFloat8Float8), 8}, // v_wmma_f32_16x16x16_bf8_fp8 + // B8 + {matrix_instruction(16, 16, 16, data_type_t::BFloat8), 8}, // v_wmma_f32_16x16x16_bf8_bf8 + // I8 + {matrix_instruction(16, 16, 16, data_type_t::Int8), 8}, // v_wmma_i32_16x16x16_iu8 + // I4 + {matrix_instruction(16, 16, 16, data_type_t::Int4), 8}, // v_wmma_i32_16x16x16_iu4 + {matrix_instruction(16, 16, 32, data_type_t::Int4), 8}, // v_wmma_i32_16x16x32_iu4 + }}, + {hardware_t::architecture_t::gfx1100, + { + // F16 + {matrix_instruction(16, 16, 16, data_type_t::Half), 16}, // v_wmma_f32_16x16x16_f16/v_wmma_f16_16x16x16_f16 + // BF16 + {matrix_instruction(16, 16, 16, data_type_t::BFloat16), 16}, // v_wmma_f32_16x16x16_bf16/v_wmma_bf16_16x16x16_bf16 + // I8 + {matrix_instruction(16, 16, 16, data_type_t::Int8), 8}, // v_wmma_i32_16x16x16_iu8 + // I4 + {matrix_instruction(16, 16, 16, data_type_t::Int4), 8}, // v_wmma_i32_16x16x16_iu4 }}}; architecture_t arch;