Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions projects/hipblaslt/tensilelite/Tensile/Components/GSU.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,9 @@ def computeLoadSrd(self, writer, kernel, tP, stmp, tileStart):
_DepthU = kernel["_DepthU%s" % tc]
# swizzle
if (tP["isSwizzled"] and tc == 'A'):
_DepthU = (_DepthU * 16) # MI_M = 16
_DepthU = (_DepthU * 16)
elif (tP["isSwizzled"] and tc == 'B'):
_DepthU = (_DepthU * 16) # MI_N = 16
_DepthU = (_DepthU * 16)

gsucLabel = Label(label=writer.labels.getNameInc(f"GSUC_{tc}"), comment="")
gsucLabelEnd = Label(label=writer.labels.getNameInc(f"GSUC_{tc}_End"), comment="")
Expand Down
37 changes: 26 additions & 11 deletions projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,15 +176,23 @@ def skExtraIters(self, writer, kernel, sSkExtraIters, sTmp):
return module

@abc.abstractmethod
def computeLoadSrd(self, writer, kernel, tc, sTmp):
def computeLoadSrd(self, writer, kernel, tP, sTmp):
pass

def computeLoadSrdCommon(self, writer, kernel, tc, sTmp):
def computeLoadSrdCommon(self, writer, kernel, tP, sTmp):
module = Module("StreamK Common computeLoadSrd")

tc = tP["tensorChar"]
_DepthU = kernel["_DepthU%s" % tc]
# swizzle
if (tP["isSwizzled"] and tc == 'A'):
_DepthU = (_DepthU * 16)
elif (tP["isSwizzled"] and tc == 'B'):
_DepthU = (_DepthU * 16)

tileStart = sTmp + 2
# StreamK partial tile - offset to tile start index
module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr("StreamKLocalStart"), src1=kernel["DepthU"], comment="StreamK tile start offset"))
module.add(SMulI32(dst=sgpr(sTmp), src0=sgpr("StreamKLocalStart"), src1=_DepthU, comment="StreamK tile start offset"))
strideL = writer.strideRef(tc, kernel["ProblemType"]["IndicesSummation"][0])
module.add(writer.s_mul_u64_u32(sgpr(sTmp), sgpr(sTmp+1), sgpr(sTmp), strideL, comment="StreamK tile start offset"))
# Overflow check removed
Expand Down Expand Up @@ -255,9 +263,16 @@ def graAddressesCommon(self, writer, kernel, tP, vTmp):
module = Module("StreamK Common graAddresses")

tc = tP["tensorChar"]
_DepthU = kernel["_DepthU%s" % tc]
# swizzle
if (tP["isSwizzled"] and tc == 'A'):
_DepthU = (_DepthU * 16)
elif (tP["isSwizzled"] and tc == 'B'):
_DepthU = (_DepthU * 16)

# StreamK partial tile - offset to tile start index
tmpOffset = writer.sgprPool.checkOut(2, "skStartOffset")
module.add(SMulI32(dst=sgpr(tmpOffset), src0=sgpr("StreamKLocalStart"), src1=int(kernel["DepthU"] * tP["bpe"]), comment="StreamK tile start offset"))
module.add(SMulI32(dst=sgpr(tmpOffset), src0=sgpr("StreamKLocalStart"), src1=int(_DepthU * tP["bpe"]), comment="StreamK tile start offset"))
strideL = writer.strideRef(tc, kernel["ProblemType"]["IndicesSummation"][0])
module.add(writer.s_mul_u64_u32(sgpr(tmpOffset), sgpr(tmpOffset+1), sgpr(tmpOffset), strideL, "StreamK tile start offset"))
# Overflow check removed
Expand Down Expand Up @@ -1745,7 +1760,7 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):
module = Module("StreamK Off graWorkGroup")
return module

def computeLoadSrd(self, writer, kernel, tc, sTmp):
def computeLoadSrd(self, writer, kernel, tP, sTmp):
module = Module("StreamK Off computeLoadSrd")
return module

Expand Down Expand Up @@ -1861,9 +1876,9 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):

return module

def computeLoadSrd(self, writer, kernel, tc, sTmp):
def computeLoadSrd(self, writer, kernel, tP, sTmp):
module = Module("StreamK Basic computeLoadSrd")
module.add(self.computeLoadSrdCommon(writer, kernel, tc, sTmp))
module.add(self.computeLoadSrdCommon(writer, kernel, tP, sTmp))
return module

def computeStoreSrdStart(self, writer, kernel):
Expand Down Expand Up @@ -1986,9 +2001,9 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):

return module

def computeLoadSrd(self, writer, kernel, tc, sTmp):
def computeLoadSrd(self, writer, kernel, tP, sTmp):
module = Module("StreamK TwoTileOriginal computeLoadSrd")
module.add(self.computeLoadSrdCommon(writer, kernel, tc, sTmp))
module.add(self.computeLoadSrdCommon(writer, kernel, tP, sTmp))
return module

def computeStoreSrdStart(self, writer, kernel):
Expand Down Expand Up @@ -2256,9 +2271,9 @@ def graWorkGroup(self, writer, kernel, tPA, tPB):

return module

def computeLoadSrd(self, writer, kernel, tc, sTmp):
def computeLoadSrd(self, writer, kernel, tP, sTmp):
module = Module("StreamK TwoTileDPFirst computeLoadSrd")
module.add(self.computeLoadSrdCommon(writer, kernel, tc, sTmp))
module.add(self.computeLoadSrdCommon(writer, kernel, tP, sTmp))
return module

def computeStoreSrdStart(self, writer, kernel):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3617,7 +3617,7 @@ def computeLoadSrd(self, kernel, tP, tc, indices, bpe):
strideF, comment="tlu=0, scaled tile-offset by stride"))

skComponent = Component.StreamK.find(self)
module.add(skComponent.computeLoadSrd(self, kernel, tc, stmp))
module.add(skComponent.computeLoadSrd(self, kernel, tP, stmp))

gsuComponent = Component.GSU.find(self)
module.add(gsuComponent.computeLoadSrd(self, kernel, tP, stmp, tileStart))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201]

GlobalParameters:
NumElementsToValidate: -1
MinimumRequiredVersion: 5.0.0
PrintLevel: 1
# PrintSolutionRejectionReason: True
Device: 0
CMakeBuildType: Debug
# MergeFiles: False
KernelTime: True
MaxWorkspaceSize: 13421772800
DataInitTypeA: 3
DataInitTypeB: 3
DataInitTypeC: 0
DataInitTypeMXSA: 3
DataInitTypeMXSB: 3
DataInitTypeAlpha: 1
DataInitTypeBeta: 0
NumElementsToValidate: -1
BoundsCheck: 0
KeepBuildTmp: True
MaxFileName: 128
DeviceLDS: 163840
MaxLDS: 163840

BenchmarkProblems:
########################################
# FP4SS
########################################
-
- # ProblemType
OperationType: GEMM
DataType: F4
DestDataType: S
ComputeDataType: S
HighPrecisionAccumulate: True
MXBlockA: 32
MXBlockB: 32
TransposeA: 1
TransposeB: 0
UseBeta: True
Batched: True
Activation: True
ActivationType: hipblaslt_all
# UseScaleAB: "Scalar"
# UseScaleCD: True
UseScaleAlphaVec: 1
UseBias: 1
BiasDataTypeList: [s]

- # BenchmarkProblemSizeGroup - Standard
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
ForkParameters:
- MatrixInstruction:
# - [16, 16, 128, 1, 1, 1,1, 1,1]
# - [16, 16, 128, 1, 1, 4,2, 2,2]
- [16, 16, 128, 1, 1, 2,4, 2,2] # 64x128
# - [16, 16, 128, 1, 1, 8,8, 2,2]
# - [32, 32, 64, 1, 1, 1,1, 1,1]
- [32, 32, 64, 1, 1, 2,2, 2,2] # 128x128
# - [32, 32, 64, 1, 1, 4,2, 2,2]
# - [32, 32, 64, 1, 1, 2,4, 2,2]
- ForceDisableShadowInit: [True, False]
# - UseSgprForGRO: [0,1]
- UseSgprForGRO: [0]
# - DepthU: [64, 128]
- DepthU: [128]
- AssertFree0ElementMultiple: [1]
- AssertFree1ElementMultiple: [1]
- AssertSummationElementMultiple: [64]
- LocalReadVectorWidth: [16]
# - PrefetchGlobalRead: [0,1,2]
- PrefetchGlobalRead: [2]
# - PrefetchLocalRead: [0,1]
- PrefetchLocalRead: [1]
# - PreloadKernArgs: [0,1]
- PreloadKernArgs: [1]
# - ClusterLocalRead: [0,1]
- ClusterLocalRead: [1]
- VectorWidthA: [1]
- VectorWidthB: [1]
- GlobalReadVectorWidthA: [16]
- GlobalReadVectorWidthB: [16]
- ScheduleIterAlg: [3]
- InnerUnroll: [1]
- TransposeLDS: [1]
- WaveSeparateGlobalReadA: [0]
- WaveSeparateGlobalReadB: [0]
- 1LDSBuffer: [0]
- GlobalReadPerMfma: [1]
- LocalWritePerMfma: [-1]
- StoreVectorWidth: [-1]
- SourceSwap: [1]
- StreamK: [3]
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Exact: [1025, 513, 1, 2048]
- Exact: [1044, 532, 1, 2048]
- Exact: [127, 127, 1, 640] #special cleanup case
- Exact: [128, 128, 1, 128]
- Exact: [129, 129, 1, 640]
- BiasTypeArgs: ['s']
- ActivationArgs:
- [Enum: none]
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201]

GlobalParameters:
NumElementsToValidate: -1
MinimumRequiredVersion: 5.0.0
PrintLevel: 1
# PrintSolutionRejectionReason: True
Device: 0
CMakeBuildType: Debug
# MergeFiles: False
KernelTime: True
MaxWorkspaceSize: 13421772800
DataInitTypeA: 3
DataInitTypeB: 3
DataInitTypeC: 0
DataInitTypeMXSA: 3
DataInitTypeMXSB: 3
DataInitTypeAlpha: 1
DataInitTypeBeta: 0
NumElementsToValidate: -1
BoundsCheck: 0
KeepBuildTmp: True
MaxFileName: 128
DeviceLDS: 163840
MaxLDS: 163840

BenchmarkProblems:
########################################
# FP8SS
########################################
-
- # ProblemType
OperationType: GEMM
DataType: F8
DestDataType: S
ComputeDataType: S
HighPrecisionAccumulate: True
MXBlockA: 32
MXBlockB: 32
TransposeA: 1
TransposeB: 0
UseBeta: True
Batched: True
Activation: True
ActivationType: hipblaslt_all
# UseScaleAB: "Scalar"
# UseScaleCD: True
UseScaleAlphaVec: 1
UseBias: 1
BiasDataTypeList: [s]

- # BenchmarkProblemSizeGroup - Standard
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
ForkParameters:
- MatrixInstruction:
# - [16, 16, 128, 1, 1, 1,1, 1,1]
# - [16, 16, 128, 1, 1, 4,2, 2,2]
- [16, 16, 128, 1, 1, 2,4, 2,2] # 64x128
# - [16, 16, 128, 1, 1, 8,8, 2,2]
# - [32, 32, 64, 1, 1, 1,1, 1,1]
- [32, 32, 64, 1, 1, 2,2, 2,2] # 128x128
# - [32, 32, 64, 1, 1, 4,2, 2,2]
# - [32, 32, 64, 1, 1, 2,4, 2,2]
- ForceDisableShadowInit: [True, False]
# - UseSgprForGRO: [0,1]
- UseSgprForGRO: [0]
# - DepthU: [64, 128]
- DepthU: [128]
- AssertFree0ElementMultiple: [1]
- AssertFree1ElementMultiple: [1]
- AssertSummationElementMultiple: [64]
- LocalReadVectorWidth: [16]
# - PrefetchGlobalRead: [0,1,2]
- PrefetchGlobalRead: [2]
# - PrefetchLocalRead: [0,1]
- PrefetchLocalRead: [1]
# - PreloadKernArgs: [0,1]
- PreloadKernArgs: [1]
# - ClusterLocalRead: [0,1]
- ClusterLocalRead: [1]
- VectorWidthA: [1]
- VectorWidthB: [1]
- GlobalReadVectorWidthA: [16]
- GlobalReadVectorWidthB: [16]
- ScheduleIterAlg: [3]
- InnerUnroll: [1]
- TransposeLDS: [1]
- WaveSeparateGlobalReadA: [0]
- WaveSeparateGlobalReadB: [0]
- 1LDSBuffer: [0]
- GlobalReadPerMfma: [1]
- LocalWritePerMfma: [-1]
- StoreVectorWidth: [-1]
- SourceSwap: [1]
- StreamK: [3]
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Exact: [1025, 513, 1, 2048]
- Exact: [1044, 532, 1, 2048]
- Exact: [127, 127, 1, 640] #special cleanup case
- Exact: [128, 128, 1, 128]
- Exact: [129, 129, 1, 640]
- BiasTypeArgs: ['s']
- ActivationArgs:
- [Enum: none]
Loading