diff --git a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py index 3b5ad562e82..231968c11f9 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py +++ b/projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py @@ -1388,7 +1388,7 @@ def fixupBatch(self, writer, kernel, ss, batchIdx, edge, gwvw, \ # AccVgpr write # if kernel.enabledSetPrioSplitLDS: # kStr += inst("s_setprio", "0", "") - if codeAccVgprWrite is not None: + if codeAccVgprWrite is not None and kernel["LocalSplitU"] == 1: regsPerScalar = writer.states.bpeCinternal // writer.states.bpr # register per scalar # loop over store instructions within one batch for elementIdx in range(0, len(batchElements)): diff --git a/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp b/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp index 47b848aa0fb..1e58902d91e 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp +++ b/projects/hipblaslt/tensilelite/Tensile/Source/lib/source/ContractionSolution.cpp @@ -2875,10 +2875,13 @@ namespace TensileLite } const bool streamKDP = Debug::Instance().useStreamKDataParrallel(); auto tiles = problem.getNumTiles(sizeMapping, gsu); - size_t skGrid = getSKGrid(problem, hardware, tiles); - // Get space required for partial tiles - if(tiles % skGrid != 0 && !streamKDP) - size += partialTileSize(skGrid); + if(tiles > 0) // Grouped GEMM reports 0 tiles + { + size_t skGrid = getSKGrid(problem, hardware, tiles); + // Get space required for partial tiles + if(skGrid > 0 && tiles % skGrid != 0 && !streamKDP) + size += partialTileSize(skGrid); + } } else { @@ -3189,6 +3192,9 @@ namespace TensileLite } } } + + if (tiles % skGrid != 0 && partialTileSize(skGrid) > problem.workspaceSize()) + skGrid = tiles; return skGrid; } // Limit the CUs Stream-K is launched on either max or the specified,