diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index ec43cdd0cb0..d9970c833f1 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -20,15 +20,16 @@ # CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ################################################################################ +from rocisa.enum import CacheScope from rocisa.code import Module, Label -from rocisa.container import vgpr, sgpr, SMEMModifiers, replaceHolder, EXEC,\ +from rocisa.container import vgpr, sgpr, SMEMModifiers, MUBUFModifiers, replaceHolder, EXEC,\ VOP3PModifiers, ContinuousRegister from rocisa.instruction import SAddCU32, SAddI32, SAddU32, SAndB32, SBarrier, \ SBranch, SCBranchSCC0, SCBranchSCC1, SCMovB32, SCSelectB32, SCmpEQU32, SCmpEQU64, \ SCmpGtU32, SCmpLeU32, SCmpLtU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \ SMinU32, SMovB32, SMovB64, SMulI32, SNop, SSleep, SStoreB32, SSubU32, \ SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \ - VReadfirstlaneB32, VCvtBF16toFP32 + VReadfirstlaneB32, VCvtBF16toFP32, BufferStoreB32 from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \ vectorStaticMultiply, BranchIfNotZero, scalarUInt32DivideAndRemainder @@ -370,7 +371,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele # Check flag module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index")) - module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="get flag")) + module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag")) module.add(SWaitCnt(kmcnt=0, comment="wait for flag load")) if kernel["DebugStreamK"] & 2 == 0: @@ -383,8 +384,12 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele module.add(VReadfirstlaneB32(dst=sgpr(tmpSgpr+2), src=vgpr("Serial"), comment="Wave 0 updates flags")) module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=0, comment="Check for wave 0")) module.add(SCBranchSCC0(labelName=skipFlagReset.getLabelName(), comment="Skip flag reset")) - # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag - module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag")) + if writer.states.asmCaps["HasScalarStore"]: + # (tmpSgpr+2) contains a vlue of 0, use it to reset the flag + module.add(SStoreB32(src=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="reset flag")) + else: + module.add(VMovB32(dst=vgpr(tmpVgpr), src=0, comment="move 0 to tmpVgpr")) + module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="reset flag")) module.add(skipFlagReset) writer.sgprPool.checkIn(tmpSgpr) @@ -684,8 +689,12 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, module.add(VReadfirstlaneB32(dst=sgpr(flagSgpr), src=vgpr("Serial"), comment="Wave 0 updates flags")) module.add(SCmpEQU32(src0=sgpr(flagSgpr), src1=0, comment="Check for wave 0")) module.add(SCBranchSCC0(labelName=skipFlagSet.getLabelName(), comment="Skip flag set")) - module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data")) - module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag")) + if writer.states.asmCaps["HasScalarStore"]: + module.add(SMovB32(dst=sgpr(flagSgpr), src=1, comment="flag data")) + module.add(SStoreB32(src=sgpr(flagSgpr), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True), comment="set flag")) + else: + module.add(VMovB32(dst=vgpr(tmpVgpr), src=1, comment="move 1 to tmpVgpr")) + module.add(self.setFlagValue(writer, src=vgpr(tmpVgpr), soffset=sgpr(tmpSgpr), comment="set flag")) module.add(skipFlagSet) module.add(SWaitCnt(kmcnt=0, comment="wait for flag")) # TODO just for testing @@ -696,6 +705,20 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, return module + def setFlagValue(self, writer, src, soffset, comment=""): + module = Module("Buffer Store Flag Value") + tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False) + module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2))) + module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB")) + module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96")) + module.add(BufferStoreB32(src=src, vaddr=vgpr("off", isOff=True), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \ + mubuf=MUBUFModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \ + comment=comment)) + module.add(SWaitCnt(vscnt=0, comment="wait for data store")) #TODO: See if this wait is necessery + writer.sgprPool.checkIn(tmpSgprBuffer) + + return module + def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edge, gwvw, atomicW, \ batchElements, addrD, addrC, \ tmpVgpr, cvtVgprStruct, batchElementSgprs, tmpSgpr, codeAccVgprRead): @@ -1631,6 +1654,12 @@ def preLoop(self, writer, kernel): xccMapping = Component.XCCMapping.find(writer) module.add(xccMapping(writer, kernel)) + # Workaround for gfx12 + if writer.states.archCaps["WorkGroupIdFromTTM"]: + module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround")) + module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround")) + module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround")) + module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index")) # Basic SK module.add(SMulI32(dst=sgpr("StreamKIter"), src0=sgpr("StreamKIdx"), src1=sgpr("SKItersPerWG"), comment="StreamK starting iteration")) @@ -1709,6 +1738,12 @@ def preLoop(self, writer, kernel): xccMapping = Component.XCCMapping.find(writer) module.add(xccMapping(writer, kernel)) + # Workaround for gfx12 + if writer.states.archCaps["WorkGroupIdFromTTM"]: + module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround")) + module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround")) + module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround")) + module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index")) # Two-tile SK (SK first) # iter count after all extra iters have been distributed @@ -1825,6 +1860,12 @@ def preLoop(self, writer, kernel): xccMapping = Component.XCCMapping.find(writer) module.add(xccMapping(writer, kernel)) + # Workaround for gfx12 + if writer.states.archCaps["WorkGroupIdFromTTM"]: + module.add(SMovB32(dst=sgpr("WorkGroup0"), src="ttmp9", comment="workaround")) + module.add(SAndB32(dst=sgpr("WorkGroup1"), src0=hex(0xFFFF), src1="ttmp7", comment="workaround")) + module.add(SLShiftRightB32(dst=sgpr("WorkGroup2"), shiftHex=hex(0x10), src="ttmp7", comment="workaround")) + module.add(SMovB32(dst=sgpr("StreamKIdx"), src=sgpr("WorkGroup0"), comment="Save original StreamK index")) # Two-tile SK (DP first) # Do DP tiles before SK diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index ff426f660c3..d68ac3a46c9 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -38,7 +38,7 @@ scalarStaticRemainder, scalarUInt32DivideAndRemainder, sMagicDiv, vectorStaticMultiply, \ vectorStaticMultiplyAdd, scalarStaticMultiply64, BranchIfZero, BranchIfNotZero, DSInit, \ ArgumentLoader -from rocisa.enum import InstType, SelectBit +from rocisa.enum import InstType, SelectBit, CacheScope from rocisa.macro import MacroVMagicDiv, PseudoRandomGenerator from . import CUSTOM_KERNEL_PATH from rocisa.instruction import BranchInstruction, BufferLoadB128, BufferLoadB32, \ @@ -12123,7 +12123,9 @@ def bufferLoadImpl(soffset): ############################################################################## def chooseGlobalWrite(self, useBuffer, bps, srcVgpr, rpv, \ addr0, addr1, offset, soffset=0, \ - glc=False, slc=False, nt=False, hi16=0, comment="store"): + glc=False, slc=False, nt=False, dlc=False, \ + scope=CacheScope.SCOPE_NONE, \ + hi16=0, comment="store"): """ create the store instruction for requested vector width and other parms rpv = regs per vector @@ -12166,7 +12168,7 @@ def bufferStoreImpl(tmpSgpr, mubuf): if offset2 >= 4096: module.add(SMovB32(dst=tmpSgpr, src=offset2, comment="large offset")) offset2 = 0 - mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, nt=nt, isStore=True) + mubuf2 = MUBUFModifiers(offen=True, offset12=offset2, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True) vgprOff = int(srcVgpr + shiftRpv * i) if isinstance(srcVgpr, int) else f"{srcVgpr}+{int(shiftRpv * i)}" module.add(BufferStoreB128(src=vgpr(vgprOff, shiftRpv), vaddr=addr0, \ saddr=addr1, soffset=tmpSgpr, mubuf=mubuf2, comment=comment)) @@ -12174,7 +12176,7 @@ def bufferStoreImpl(tmpSgpr, mubuf): assert 0, "bad bps" if useBuffer: - mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, nt=nt, isStore=True) + mubuf = MUBUFModifiers(offen=True, offset12=offset, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True) if soffset != 0: assert offset < 4096, "sgpr offset provided with large const offset" # buffer_load offset field is 12-bit. @@ -12185,13 +12187,13 @@ def bufferStoreImpl(tmpSgpr, mubuf): tmpSgpr = sgpr(tmpSgprInfo.idx) if offset >= 4096: module.add(SMovB32(dst=tmpSgpr, src=offset, comment="large offset")) - mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, nt=nt, isStore=True) + mubuf = MUBUFModifiers(offen=True, offset12=0, glc=glc, slc=slc, dlc=dlc, scope=scope, nt=nt, isStore=True) bufferStoreImpl(tmpSgpr, mubuf) else: bufferStoreImpl(soffset, mubuf) else: - flat = FLATModifiers(glc=glc, slc=slc, isStore=True) + flat = FLATModifiers(glc=glc, slc=slc, dlc=dlc, scope=scope, isStore=True) if bps==2 and hi16: module.add(FlatStoreD16HIB16(vaddr=addr0, src=vgpr(srcVgpr*2), flat=flat, comment=comment)) elif bps==2 and not hi16: @@ -12321,6 +12323,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset isGlc = False isSlc = False isNT = False + scope = CacheScope.SCOPE_NONE + isDlc = False if tc == 'D': isGlc = bool(kernel["NonTemporalD"] & 0x1) @@ -12361,6 +12365,8 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset isGlc = True isSlc = True isNT = bool(kernel["NonTemporalD"] & 0x4) + isDlc = True + scope = CacheScope.SCOPE_DEV bps = self.states.bpeCinternal * ss.cfg.gwvw rpv = self.states.bpeCinternal * ss.cfg.gwvw / self.states.bpr @@ -12410,36 +12416,36 @@ def addStore(self, kernel, ss, tc: str, addrCalc, sumIdx, tmpS01, edge, wsOffset if self.states.asmCaps["HasWMMA_V1"] and kernel["EnableMatrixInstruction"]: module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment)) else: module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx//2, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, hi16=sumIdx%2, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=sumIdx%2, comment=comment)) else: # (B,B,B,B,S,S), internal S # (H,H,H,H,H,H), internal S # (H,H,H,H,S,S), internal S module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, hi16=0, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, hi16=0, comment=comment)) elif dataType.isInt32() or dataType.isSingle(): module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) elif dataType.isDouble() or dataType.isSingleComplex(): module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*2, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) elif dataType.isDoubleComplex(): rps = dataType.numRegisters() module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx*rps, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) elif dataType.isInt8() or dataType.isAnyFloat8() or dataType.isAnyBFloat8() or dataType.isAnyFloat8BFloat8() or dataType.isAnyBFloat8Float8(): if kernel["ProblemType"]["HighPrecisionAccumulate"]: module.add(self.chooseGlobalWrite(useBuffer, bps, sumIdx, rpv, \ addr0, addr1, globalOffset, soffset=wsOffset, \ - glc=isGlc, slc=isSlc, nt=isNT, comment=comment)) + glc=isGlc, slc=isSlc, nt=isNT, dlc=isDlc, scope=scope, comment=comment)) return module ############################################################################## diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index e125227a01a..bb3f2bdf238 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -954,8 +954,8 @@ def assignDerivedParameters( state["GlobalSplitUAlgorithm"] = "MultipleBuffer" # Set default Algorithm if not state["EnableMatrixInstruction"]: reject(state, printRejectionReason, "Stream-K requires MatrixInstruction") - if isaInfoMap[isa].asmCaps["HasWMMA"]: - reject(state, printRejectionReason, "Stream-K untested with WMMA") + # if isaInfoMap[isa].asmCaps["HasWMMA"]: + # reject(state, printRejectionReason, "Stream-K untested with WMMA") # if state["PersistentKernel"]: # reject(state, printRejectionReason, "Cannot enable both Stream-K and PersistentKernel") if not state["ProblemType"]["StridedBatched"]: diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml new file mode 100644 index 00000000000..601f1637277 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_bgemm_quick.yaml @@ -0,0 +1,450 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1200, skip-gfx1201] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml new file mode 100644 index 00000000000..f9113cf9908 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx11/sk_hgemm_quick.yaml @@ -0,0 +1,450 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1200, skip-gfx1201] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + - LocalReadVectorWidth: [16] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 1, 1, 4, 1] + - [16, 16, 16, 1, 1, 2, 1, 4, 1] + - [16, 16, 16, 1, 1, 3, 1, 4, 1] + - [16, 16, 16, 1, 1, 4, 1, 4, 1] + - [16, 16, 16, 1, 1, 1, 1, 2, 2] + - [16, 16, 16, 1, 1, 2, 1, 2, 2] + - [16, 16, 16, 1, 1, 3, 1, 2, 2] + - [16, 16, 16, 1, 1, 5, 1, 2, 2] + - [16, 16, 16, 1, 1, 6, 1, 2, 2] + - [16, 16, 16, 1, 1, 7, 1, 2, 2] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 2, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - [16, 16, 16, 1, 1, 6, 1, 1, 4] + - [16, 16, 16, 1, 1, 7, 1, 1, 4] + - [16, 16, 16, 1, 1, 9, 1, 1, 4] + - [16, 16, 16, 1, 1, 0, 1, 1, 4] + - [16, 16, 16, 1, 1, 1, 1, 1, 4] + - [16, 16, 16, 1, 1, 3, 1, 1, 4] + - [16, 16, 16, 1, 1, 4, 1, 1, 4] + - [16, 16, 16, 1, 1, 5, 1, 1, 4] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml new file mode 100644 index 00000000000..ec1755ba557 --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_bgemm_quick.yaml @@ -0,0 +1,414 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] \ No newline at end of file diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml new file mode 100644 index 00000000000..c9aa0fe661c --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/streamk/gfx12/sk_hgemm_quick.yaml @@ -0,0 +1,414 @@ +TestParameters: + marks: [ skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + KernelTime: False + DataInitTypeAlpha: 1 + DataInitTypeBeta: 1 + DataInitTypeA: 12 + DataInitTypeB: 13 + DataInitTypeC: 12 + # DataInitTypeC: 1 + # ValidationPrintValids: True + MaxWorkspaceSize: 134217728 + # PrintSolutionRejectionReason: True + # ForceGenerateKernel: True + # GenerateSourcesAndExit: True + NumWarmups: 0 + EnqueuesPerSync: 1 + # NumBenchmarks: 10 + SleepPercent: 50 + +BenchmarkProblems: + + - # HGEMM NT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM NT - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM NT - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM NN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM NN - Test MatrixInstruction variants + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [4] + # - VectorWidthB: [4] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + - Exact: [4103, 4103, 1, 1031] + + - # HGEMM TN + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # HGEMM TN - Test DepthU, WGM, VW + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [1] + - DepthU: [ 32, 64 ] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [2, 4, 8] + - GlobalReadVectorWidthB: [2, 4, 8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [8] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - ScheduleIterAlg: [3] + - SourceSwap: [True] + # - StoreVectorWidth: [4] + - StreamK: [3] + - TransposeLDS: [0] + # - VectorWidthA: [1, 2, 4] + # - VectorWidthB: [1, 2, 4] + - WorkGroupMapping: [0, 1, 2, 4, 8, 16, 32, 64] # works + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] + + - # HGEMM TT + - # ProblemType + OperationType: GEMM + DataType: h + DestDataType: h + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # HGEMM TT - Test tuning params + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + ForkParameters: + - WavefrontSize: [32] + - 1LDSBuffer: [0, 1] + - DepthU: [64] + - ExpandPointerSwap: [False] + - GlobalReadVectorWidthA: [8] + - GlobalReadVectorWidthB: [8] + - GlobalSplitU: [0] + # - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16, 16, 16, 1, 1, 8,8, 2,2] + - [16, 16, 16, 1, 1, 8,4, 2,2] + - [16, 16, 16, 1, 1, 8,2, 2,2] + - [16, 16, 16, 1, 1, 8,1, 2,2] + - [16, 16, 16, 1, 1, 4,8, 2,2] + - [16, 16, 16, 1, 1, 4,4, 2,2] + - [16, 16, 16, 1, 1, 4,2, 2,2] + - [16, 16, 16, 1, 1, 4,1, 2,2] + - [16, 16, 16, 1, 1, 2,8, 2,2] + - [16, 16, 16, 1, 1, 2,4, 2,2] + - [16, 16, 16, 1, 1, 2,2, 2,2] + - [16, 16, 16, 1, 1, 2,1, 2,2] + - [16, 16, 16, 1, 1, 1,8, 2,2] + - [16, 16, 16, 1, 1, 1,4, 2,2] + - [16, 16, 16, 1, 1, 1,2, 2,2] + - [16, 16, 16, 1, 1, 1,1, 2,2] + - MIArchVgpr: [0] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [1] + - StaggerU: [0, 32] + - ScheduleIterAlg: [3] + - SourceSwap: [False, True] + # - StoreVectorWidth: [1] + - StreamK: [3] + - PrefetchLocalRead: [1, 3] + - NonTemporalC: [0, 7] + - NonTemporalD: [0, 7] + - TransposeLDS: [0, 1] + # - VectorWidthA: [1] + # - VectorWidthB: [1] + - WorkGroupMapping: [1] + + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + # - Exact: [4096, 4096, 1, 1024] + # - Exact: [4103, 4096, 1, 1024] + # - Exact: [4096, 4103, 1, 1024] + # - Exact: [4096, 4096, 1, 1031] + # - Exact: [4103, 4103, 1, 1031] + - Exact: [2055, 2055, 1, 1031] \ No newline at end of file diff --git a/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp b/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp index c5c0bc26126..2177f316933 100644 --- a/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp +++ b/projects/hipblaslt/tensilelite/include/Tensile/analytical/Hardware.hpp @@ -48,6 +48,7 @@ namespace TensileLite Int32, BFloat16, Int8, + Int4, Int64, XFloat32, Float8_fnuz, @@ -92,6 +93,8 @@ namespace TensileLite return 16; case DataType::Int8: return 8; + case DataType::Int4: + return 4; case DataType::Int64: return 64; case DataType::XFloat32: @@ -145,6 +148,8 @@ namespace TensileLite return "BFloat16"; case DataType::Int8: return "Int8"; + case DataType::Int4: + return "Int4"; case DataType::Int64: return "Int64"; case DataType::XFloat32: @@ -195,6 +200,8 @@ namespace TensileLite return DataType::BFloat16; if (s == "i8") return DataType::Int8; + if (s == "i4") + return DataType::Int4; if (s == "xf32") return DataType::XFloat32; if (s == "f8") @@ -287,6 +294,8 @@ namespace TensileLite gfx90a, gfx942, gfx950, + gfx1201, + gfx1100, Count }; @@ -295,7 +304,9 @@ namespace TensileLite static const std::unordered_map strToEnumMap = {{"gfx90a", Architecture::gfx90a}, {"gfx942", Architecture::gfx942}, - {"gfx950", Architecture::gfx950}}; + {"gfx950", Architecture::gfx950}, + {"gfx1201", Architecture::gfx1201}, + {"gfx1100", Architecture::gfx1100}}; auto it = strToEnumMap.find(str); if(it != strToEnumMap.end()) diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp index cc8ed19824e..1b0a3c10508 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/container.hpp @@ -102,15 +102,19 @@ namespace rocisa struct FLATModifiers : public Container { - FLATModifiers(int offset12 = 0, - bool glc = false, - bool slc = false, - bool lds = false, - bool isStore = false) + FLATModifiers(int offset12 = 0, + bool glc = false, + bool slc = false, + bool dlc = false, + CacheScope scope = CacheScope::SCOPE_NONE, + bool lds = false, + bool isStore = false) : Container() , offset12(offset12) , glc(glc) , slc(slc) + , dlc(dlc) + , scope(scope) , lds(lds) , isStore(isStore) { @@ -121,6 +125,8 @@ namespace rocisa , offset12(other.offset12) , glc(other.glc) , slc(other.slc) + , dlc(other.dlc) + , scope(other.scope) , lds(other.lds) , isStore(other.isStore) { @@ -133,20 +139,30 @@ namespace rocisa std::string toString() const override { - auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; + auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; + auto hasDLCModifier = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"]; + auto hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"]; std::string kStr; if(offset12 != 0) { kStr += " offset:" + std::to_string(offset12); } - if(glc) + if(hasGLCModifier && glc) { kStr += " " + getGlcBitName(hasGLCModifier); } - if(slc) + if(hasGLCModifier && slc) { kStr += " " + getSlcBitName(hasGLCModifier); } + if(hasDLCModifier && dlc) + { + kStr += " dlc"; + } + if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE) + { + kStr += " scope:" + ::rocisa::toString(scope); + } if(lds) { kStr += " lds"; @@ -154,27 +170,33 @@ namespace rocisa return kStr; } - int offset12; - bool glc; - bool slc; - bool lds; - bool isStore; + int offset12; + bool glc; + bool slc; + bool dlc; + CacheScope scope; + bool lds; + bool isStore; }; struct MUBUFModifiers : public Container { - MUBUFModifiers(bool offen = false, - int offset12 = 0, - bool glc = false, - bool slc = false, - bool nt = false, - bool lds = false, - bool isStore = false) + MUBUFModifiers(bool offen = false, + int offset12 = 0, + bool glc = false, + bool slc = false, + bool dlc = false, + CacheScope scope = CacheScope::SCOPE_NONE, + bool nt = false, + bool lds = false, + bool isStore = false) : Container() , offen(offen) , offset12(offset12) , glc(glc) , slc(slc) + , dlc(dlc) + , scope(scope) , nt(nt) , lds(lds) , isStore(isStore) @@ -187,6 +209,8 @@ namespace rocisa , offset12(other.offset12) , glc(other.glc) , slc(other.slc) + , dlc(other.dlc) + , scope(other.scope) , nt(other.nt) , lds(other.lds) , isStore(other.isStore) @@ -200,26 +224,32 @@ namespace rocisa std::string toString() const override { - auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; - auto hasSLCModifier = rocIsa::getInstance().getAsmCaps()["HasSLCModifier"]; - auto hasNTModifier = rocIsa::getInstance().getAsmCaps()["HasNTModifier"]; + auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; + auto hasSLCModifier = rocIsa::getInstance().getAsmCaps()["HasSLCModifier"]; + auto hasDLCModifier = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"]; + auto hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"]; + auto hasNTModifier = rocIsa::getInstance().getAsmCaps()["HasNTModifier"]; std::string kStr; if(offen) { kStr += " offen offset:" + std::to_string(offset12); } - if(glc || slc || lds) - { - kStr += ","; - } - if(glc) + if(hasGLCModifier && glc) { kStr += " " + getGlcBitName(hasGLCModifier); } - if(slc) + if(hasGLCModifier && slc) { kStr += " " + getSlcBitName(hasGLCModifier); } + if(hasDLCModifier && dlc) + { + kStr += " dlc"; + } + if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE) + { + kStr += " scope:" + ::rocisa::toString(scope); + } if(hasNTModifier && nt) { kStr += " nt"; @@ -231,20 +261,28 @@ namespace rocisa return kStr; } - bool offen; - int offset12; - bool glc; - bool slc; - bool nt; - bool lds; - bool isStore; + bool offen; + int offset12; + bool glc; + bool slc; + bool dlc; + CacheScope scope; + bool nt; + bool lds; + bool isStore; }; struct SMEMModifiers : public Container { - SMEMModifiers(bool glc = false, bool nv = false, int offset = 0) + SMEMModifiers(bool glc = false, + bool dlc = false, + CacheScope scope = CacheScope::SCOPE_NONE, + bool nv = false, + int offset = 0) : Container() , glc(glc) + , dlc(dlc) + , scope(scope) , nv(nv) , offset(offset) // 20u 21s shaes the same { @@ -253,6 +291,8 @@ namespace rocisa SMEMModifiers(const SMEMModifiers& other) : Container() , glc(other.glc) + , dlc(other.dlc) + , scope(other.scope) , nv(other.nv) , offset(other.offset) { @@ -265,14 +305,25 @@ namespace rocisa std::string toString() const override { + auto hasGLCModifier = rocIsa::getInstance().getAsmCaps()["HasGLCModifier"]; + auto hasDLCModifier = rocIsa::getInstance().getAsmCaps()["HasDLCModifier"]; + auto hasSCOPEModifier = rocIsa::getInstance().getAsmCaps()["HasSCOPEModifier"]; std::string kStr; if(offset != 0) { kStr += " offset:" + std::to_string(offset); } - if(glc) + if(hasGLCModifier && glc) { - kStr += " glc"; + kStr += " " + getGlcBitName(hasGLCModifier); + } + if(hasDLCModifier && dlc) + { + kStr += " dlc"; + } + if(hasSCOPEModifier && scope != CacheScope::SCOPE_NONE) + { + kStr += " scope:" + ::rocisa::toString(scope); } if(nv) { @@ -281,9 +332,11 @@ namespace rocisa return kStr; } - bool glc; - bool nv; - int offset; + bool glc; + bool dlc; + CacheScope scope; + bool nv; + int offset; }; struct SDWAModifiers : public Container @@ -619,6 +672,7 @@ namespace rocisa bool isMinus; bool isAbs; bool isMacro; + bool isOff; RegisterContainer(const std::string& regType, const std::optional& regName, @@ -633,6 +687,7 @@ namespace rocisa , isMinus(false) , isAbs(false) , isMacro(false) + , isOff(false) { } @@ -640,6 +695,7 @@ namespace rocisa const std::optional& regName, bool isAbs, bool isMacro, + bool isOff, int regIdx = 0, float regNum = 1) : Container() @@ -651,6 +707,7 @@ namespace rocisa , isMinus(false) , isAbs(isAbs) , isMacro(isMacro) + , isOff(isOff) { } @@ -664,6 +721,7 @@ namespace rocisa , isMinus(other.isMinus) , isAbs(other.isAbs) , isMacro(other.isMacro) + , isOff(other.isOff) { } @@ -687,6 +745,7 @@ namespace rocisa , isMinus(other.isMinus) , isAbs(other.isAbs) , isMacro(other.isMacro) + , isOff(other.isOff) { } @@ -702,6 +761,7 @@ namespace rocisa isMinus = other.isMinus; isAbs = other.isAbs; isMacro = other.isMacro; + isOff = other.isOff; } return *this; } @@ -718,6 +778,7 @@ namespace rocisa isMinus = other.isMinus; isAbs = other.isAbs; isMacro = other.isMacro; + isOff = other.isOff; } return *this; } @@ -829,6 +890,11 @@ namespace rocisa std::string toString() const override { + if(isOff) + { + return "off"; + } + std::string minusStr = isMinus ? "-" : ""; minusStr = isAbs ? "abs(" + minusStr : minusStr; auto absStr = isAbs ? ")" : ""; @@ -1090,8 +1156,11 @@ namespace rocisa // Overloaded functions to create specific GPR containers with default regNum = 1.f std::shared_ptr vgpr(const Holder& holder, float regNum = 1.f); std::shared_ptr vgpr(int idx, float regNum = 1.f); - std::shared_ptr - vgpr(const std::string& name, float regNum = 1.f, bool isMacro = false, bool isAbs = false); + std::shared_ptr vgpr(const std::string& name, + float regNum = 1.f, + bool isMacro = false, + bool isAbs = false, + bool isOff = false); std::shared_ptr sgpr(const Holder& holder, float regNum = 1.f); std::shared_ptr sgpr(int idx, float regNum = 1.f); std::shared_ptr diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp index e1c748d4a2f..d5fa58d6c59 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/enum.hpp @@ -222,6 +222,15 @@ namespace rocisa UNUSED_PRESERVE = 3 }; + enum class CacheScope : int + { + SCOPE_NONE = 0, + SCOPE_CU = 1, + SCOPE_SE = 2, + SCOPE_DEV = 3, + SCOPE_SYS = 4, + }; + enum class CvtType : int { CVT_F16_to_F32 = 1, @@ -294,6 +303,23 @@ namespace rocisa } } + inline std::string toString(CacheScope scope) + { + switch(scope) + { + case CacheScope::SCOPE_CU: + return "SCOPE_CU"; + case CacheScope::SCOPE_SE: + return "SCOPE_SE"; + case CacheScope::SCOPE_DEV: + return "SCOPE_DEV"; + case CacheScope::SCOPE_SYS: + return "SCOPE_SYS"; + default: + return ""; + } + } + enum class SaturateCastType : int { NORMAL = 1, diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp index cee81dbf654..fe2f26fc10d 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/hardware_caps.hpp @@ -117,7 +117,8 @@ inline std::map = tryAssembler(isaVersion, assemblerPath, "v_lshl_or_b32 v47, v36, 0x2, v34", isDebug); rv["HasSMulHi"] = tryAssembler(isaVersion, assemblerPath, "s_mul_hi_u32 s47, s36, s34", isDebug); - + rv["HasScalarStore"] + = tryAssembler(isaVersion, assemblerPath, "s_store_dword s79, s[s70:s71], s77", isDebug); rv["HasMFMA_explictB"] = tryAssembler( isaVersion, assemblerPath, "v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]", isDebug); rv["HasMFMA"] = tryAssembler(isaVersion, @@ -270,6 +271,26 @@ inline std::map assemblerPath, "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, glc", isDebug); + rv["HasDLCModifier"] + = tryAssembler(isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, dlc", + isDebug) + || tryAssembler(isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, dlc", + isDebug); + rv["HasSCOPEModifier"] + = tryAssembler( + isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], 0, offen offset:0, scope:SCOPE_DEV", + isDebug) + || tryAssembler( + isaVersion, + assemblerPath, + "buffer_load_dwordx4 v[10:13], v[0], s[0:3], null, offen offset:0, scope:SCOPE_DEV", + isDebug); rv["HasMUBUFConst"] = tryAssembler(isaVersion, assemblerPath, "buffer_load_dword v40, v36, s[24:27], 1 offen offset:0", diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp index 5c105cc63fe..8b21934b958 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/include/instruction/mem.hpp @@ -775,7 +775,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_U8, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -799,7 +799,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_U8, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -823,7 +823,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_B16, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -847,7 +847,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFReadInstruction( - InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_B16, dst, vaddr, saddr, soffset, mubuf, comment) { } @@ -1110,7 +1110,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_U8, src, vaddr, saddr, soffset, mubuf, comment) { } @@ -1134,7 +1134,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_U8, src, vaddr, saddr, soffset, mubuf, comment) { } @@ -1158,7 +1158,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_HI_B16, src, vaddr, saddr, soffset, mubuf, comment) { } @@ -1182,7 +1182,7 @@ namespace rocisa std::optional mubuf = std::nullopt, const std::string& comment = "") : MUBUFStoreInstruction( - InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment) + InstType::INST_D16_B16, src, vaddr, saddr, soffset, mubuf, comment) { } diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp index 966406f58f4..b9c8d246668 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/container.cpp @@ -105,10 +105,12 @@ namespace rocisa const std::string& name, float regNum = 1.f, bool isMacro = false, - bool isAbs = false) + bool isAbs = false, + bool isOff = false) { RegName regname = generateRegName(name); - return std::make_shared(gprType, regname, isAbs, isMacro, -1, regNum); + return std::make_shared( + gprType, regname, isAbs, isMacro, isOff, -1, regNum); } // Overloaded functions to create specific GPR containers with default regNum = 1.f @@ -123,9 +125,9 @@ namespace rocisa } std::shared_ptr - vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs) + vgpr(const std::string& name, float regNum, bool isMacro, bool isAbs, bool isOff) { - return createGPR("v", name, regNum, isMacro, isAbs); + return createGPR("v", name, regNum, isMacro, isAbs, isOff); } std::shared_ptr sgpr(const Holder& holder, float regNum) @@ -187,11 +189,12 @@ void init_containers(nb::module_ m) nb::arg("idx"), nb::arg("regNum") = 1.f); m_con.def("vgpr", - nb::overload_cast(&rocisa::vgpr), + nb::overload_cast(&rocisa::vgpr), nb::arg("name"), nb::arg("regNum") = 1.f, nb::arg("isMacro") = false, - nb::arg("isAbs") = false); + nb::arg("isAbs") = false, + nb::arg("isOff") = false); m_con.def("sgpr", nb::overload_cast(&rocisa::sgpr), @@ -276,10 +279,12 @@ void init_containers(nb::module_ m) std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t)); }); nb::class_(m_con, "FLATModifiers") - .def(nb::init(), + .def(nb::init(), nb::arg("offset12") = 0, nb::arg("glc") = false, nb::arg("slc") = false, + nb::arg("dlc") = false, + nb::arg("scope") = 0, nb::arg("lds") = false, nb::arg("isStore") = false) .def_rw("isStore", &rocisa::FLATModifiers::isStore) @@ -290,21 +295,34 @@ void init_containers(nb::module_ m) }) .def("__getstate__", [](const rocisa::FLATModifiers& self) { - return std::make_tuple(self.offset12, self.glc, self.slc, self.lds, self.isStore); + return std::make_tuple(self.offset12, + self.glc, + self.slc, + self.dlc, + self.scope, + self.lds, + self.isStore); }) - .def( - "__setstate__", - [](rocisa::FLATModifiers& self, std::tuple t) { - new(&self) rocisa::FLATModifiers( - std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t)); - }); + .def("__setstate__", + [](rocisa::FLATModifiers& self, + std::tuple t) { + new(&self) rocisa::FLATModifiers(std::get<0>(t), + std::get<1>(t), + std::get<2>(t), + std::get<3>(t), + std::get<4>(t), + std::get<5>(t), + std::get<6>(t)); + }); nb::class_(m_con, "MUBUFModifiers") - .def(nb::init(), + .def(nb::init(), nb::arg("offen") = false, nb::arg("offset12") = 0, nb::arg("glc") = false, nb::arg("slc") = false, + nb::arg("dlc") = false, + nb::arg("scope") = 0, nb::arg("nt") = false, nb::arg("lds") = false, nb::arg("isStore") = false) @@ -314,27 +332,37 @@ void init_containers(nb::module_ m) [](const rocisa::MUBUFModifiers& self, nb::dict&) { return rocisa::MUBUFModifiers(self); }) - .def( - "__getstate__", - [](const rocisa::MUBUFModifiers& self) { - return std::make_tuple( - self.offen, self.offset12, self.glc, self.slc, self.nt, self.lds, self.isStore); - }) + .def("__getstate__", + [](const rocisa::MUBUFModifiers& self) { + return std::make_tuple(self.offen, + self.offset12, + self.glc, + self.slc, + self.dlc, + self.scope, + self.nt, + self.lds, + self.isStore); + }) .def("__setstate__", - [](rocisa::MUBUFModifiers& self, - std::tuple t) { + [](rocisa::MUBUFModifiers& self, + std::tuple t) { new(&self) rocisa::MUBUFModifiers(std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t), std::get<5>(t), - std::get<6>(t)); + std::get<6>(t), + std::get<7>(t), + std::get<8>(t)); }); nb::class_(m_con, "SMEMModifiers") - .def(nb::init(), + .def(nb::init(), nb::arg("glc") = false, + nb::arg("dlc") = false, + nb::arg("scope") = 0, nb::arg("nv") = false, nb::arg("offset") = 0) .def("__str__", &rocisa::SMEMModifiers::toString) @@ -344,11 +372,15 @@ void init_containers(nb::module_ m) }) .def("__getstate__", [](const rocisa::SMEMModifiers& self) { - return std::make_tuple(self.glc, self.nv, self.offset); + return std::make_tuple(self.glc, self.dlc, self.scope, self.nv, self.offset); }) - .def("__setstate__", [](rocisa::SMEMModifiers& self, std::tuple t) { - new(&self) rocisa::SMEMModifiers(std::get<0>(t), std::get<1>(t), std::get<2>(t)); - }); + .def( + "__setstate__", + [](rocisa::SMEMModifiers& self, + std::tuple t) { + new(&self) rocisa::SMEMModifiers( + std::get<0>(t), std::get<1>(t), std::get<2>(t), std::get<3>(t), std::get<4>(t)); + }); nb::class_(m_con, "SDWAModifiers") .def(nb::init(), diff --git a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp index d09ed7c4108..be435f90255 100644 --- a/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp +++ b/projects/hipblaslt/tensilelite/rocisa/rocisa/src/enum.cpp @@ -128,6 +128,14 @@ void init_enum(nb::module_ m) .value("UNUSED_PRESERVE", rocisa::UnusedBit::UNUSED_PRESERVE) .export_values(); + nb::enum_(m_enum, "CacheScope") + .value("SCOPE_NONE", rocisa::CacheScope::SCOPE_NONE) + .value("SCOPE_CU", rocisa::CacheScope::SCOPE_CU) + .value("SCOPE_SE", rocisa::CacheScope::SCOPE_SE) + .value("SCOPE_DEV", rocisa::CacheScope::SCOPE_DEV) + .value("SCOPE_SYS", rocisa::CacheScope::SCOPE_SYS) + .export_values(); + nb::enum_(m_enum, "CvtType") .value("CVT_F16_to_F32", rocisa::CvtType::CVT_F16_to_F32) .value("CVT_F32_to_F16", rocisa::CvtType::CVT_F32_to_F16) diff --git a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py index c550419085a..9fcb7ce6d78 100644 --- a/projects/hipblaslt/tensilelite/rocisa/test/test_container.py +++ b/projects/hipblaslt/tensilelite/rocisa/test/test_container.py @@ -41,15 +41,23 @@ def test_containers(): assert str(ds_modifiers) == " offset:2 gds" # Test FLATModifiers - flat_modifiers = rocisa.container.FLATModifiers(8, True, False, True, False) + flat_modifiers = rocisa.container.FLATModifiers( + offset12=8, glc=True, slc=False, dlc=False, + scope=rocisa.enum.CacheScope.SCOPE_NONE, + lds=True, isStore=False) assert str(flat_modifiers) == " offset:8 glc lds" # Test MUBUFModifiers - mubuf_modifiers = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True) - assert str(mubuf_modifiers) == " offen offset:12, glc" + mubuf_modifiers = rocisa.container.MUBUFModifiers( + offen=True, offset12=12, glc=True, slc=False, + dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nt=True, lds=False, isStore=True) + assert str(mubuf_modifiers) == " offen offset:12 glc" # Test SMEMModifiers - smem_modifiers = rocisa.container.SMEMModifiers(True, False, 8) + smem_modifiers = rocisa.container.SMEMModifiers( + glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nv=False, offset=8) assert str(smem_modifiers) == " offset:8 glc" # Test SDWAModifiers @@ -104,7 +112,7 @@ def test_containers(): # Test Holder from rocisa.container import vgpr, sgpr, accvgpr, mgpr, Holder - holder = Holder("holder"); + holder = Holder("holder") assert holder.idx == -1 assert str(holder.name) == "holder" assert isinstance(holder, Holder) @@ -120,6 +128,8 @@ def test_containers(): assert str(testGpr) == "acc4" testGpr = mgpr(holder) assert str(testGpr) == "m[mgprholder]" + testGpr = vgpr("off", isOff=True) + assert str(testGpr) == "off" def test_containers_copy(): def copy_test(name, obj): @@ -133,9 +143,14 @@ def copy_test(name, obj): copy_test("DSModifiers", ds) flat = rocisa.container.FLATModifiers(lds=True) copy_test("FLATModifiers", flat) - mubuf = rocisa.container.MUBUFModifiers(True, 12, True, False, True, False, True) + mubuf = rocisa.container.MUBUFModifiers( + offen=True, offset12=12, glc=True, slc=False, + dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nt=True, lds=False, isStore=True) copy_test("MUBUFModifiers", mubuf) - smem = rocisa.container.SMEMModifiers(True, False, 8) + smem = rocisa.container.SMEMModifiers( + glc=True, dlc=False, scope=rocisa.enum.CacheScope.SCOPE_NONE, + nv=False, offset=8) copy_test("SMEMModifiers", smem) sdwa = rocisa.container.SDWAModifiers( dst_sel=rocisa.enum.SelectBit.WORD_0, diff --git a/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp b/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp index c65a994e6bb..e355539cb66 100644 --- a/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp +++ b/projects/hipblaslt/tensilelite/src/analytical/Hardware.cpp @@ -40,7 +40,13 @@ namespace TensileLite 8, 17, 1.21875121875121875122 * 6, 4, 4, 1.5e-2, 1.5)}, {Hardware::Architecture::gfx950, Hardware::ArchitectureConstants( - 8, 17, 1.21875121875121875122 * 7, 6, 4, 0.008, 1.5)}}; + 8, 17, 1.21875121875121875122 * 7, 6, 4, 0.008, 1.5)}, + {Hardware::Architecture::gfx1201, + Hardware::ArchitectureConstants( + 1, 5.74, 1.21875121875121875122 * 2.41, 0.464, 2, 0.17, 1.5)}, + {Hardware::Architecture::gfx1100, + Hardware::ArchitectureConstants( + 1, 7.12, 1.21875121875121875122 * 3.48, 0.732, 2, 0.11, 1.5)}}; // Schema : (MI_M, MI_N, MI_K, DataType) const std::unordered_map