diff --git a/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py b/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py index ca4cfde1f86..ced1f8c64a9 100644 --- a/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/ClientWriter.py @@ -209,6 +209,10 @@ def runNewClient(scriptPath, clientParametersPath, cxxCompiler: str, cCompiler: iniFile = "--config-file={}".format(clientParametersPath) args = [clientExe, iniFile] + # Add MX scale format if set + if globalParameters["MXScaleFormat"]: + args.append("--mx-scale-format={}".format(globalParameters["MXScaleFormat"])) + try: subprocess.run(args, check=True) except (subprocess.CalledProcessError, OSError) as e: @@ -324,8 +328,9 @@ def writeRunScript(path, forBenchmark, enableTileSelection, cxxCompiler: str, cC clientExe = getClientExecutablePath() timingFlag = " --timing-instrumentation" if globalParameters["TimingInstrumentation"] else "" + mxScaleFormatFlag = " --mx-scale-format={}".format(globalParameters["MXScaleFormat"]) if globalParameters["MXScaleFormat"] else "" for configFile in configPaths: - runScriptFile.write("{} --config-file {}{}\n".format(clientExe, configFile, timingFlag)) + runScriptFile.write("{} --config-file {}{}{}\n".format(clientExe, configFile, timingFlag, mxScaleFormatFlag)) runScriptFile.write("ERR2=$?\n\n") runScriptFile.write(""" @@ -347,8 +352,9 @@ def writeRunScript(path, forBenchmark, enableTileSelection, cxxCompiler: str, cC runScriptFile.write("%s -d 0 --resetclocks\n" % globalParameters["ROCmSMIPath"]) runScriptFile.write("%s -d 0 --setfan 50\n" % globalParameters["ROCmSMIPath"]) else: + mxScaleFormatFlag = " --mx-scale-format={}".format(globalParameters["MXScaleFormat"]) if globalParameters["MXScaleFormat"] else "" for configFile in configPaths: - runScriptFile.write("{} --config-file {} --best-solution 1\n".format(getClientExecutablePath(), configFile)) + runScriptFile.write("{} --config-file {} --best-solution 1{}\n".format(getClientExecutablePath(), configFile, mxScaleFormatFlag)) if os.name != "nt": runScriptFile.write("exit $ERR\n") diff --git a/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py b/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py index b9303231ff7..1e45469e0ee 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py +++ b/projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py @@ -137,6 +137,7 @@ ) globalParameters["LogicFormat"] = "yaml" # set library backend (yaml, or json) globalParameters["LibraryFormat"] = "yaml" # set library backend (yaml, or msgpack) +globalParameters["MXScaleFormat"] = 0 # MX scale data format (0=none, 1=pre-swizzle for GPU kernel layout) # True/False: CSV will/won't export WinnerGFlops, WinnerTimeUS, WinnerIdx, WinnerName. # TODO - if no side-effect, we can set default to True. This can make analyzing "LibraryLogic" (AddFromCSV) faster diff --git a/projects/hipblaslt/tensilelite/Tensile/Tensile.py b/projects/hipblaslt/tensilelite/Tensile/Tensile.py index 59151723811..8406edbcc1d 100644 --- a/projects/hipblaslt/tensilelite/Tensile/Tensile.py +++ b/projects/hipblaslt/tensilelite/Tensile/Tensile.py @@ -220,6 +220,8 @@ def splitExtraParameters(par): argParser.add_argument("--client-lock", default=None) argParser.add_argument("--prebuilt-client", default=str(TENSILE_CLIENT_PATH), type=os.path.abspath, help="Specify the full path to a pre-built tensilelite-client executable") + argParser.add_argument("--mx-scale-format", dest="MXScaleFormat", type=int, default=0, + help="MX scale data format (0=none, 1=pre-swizzle for GPU kernel layout)") argParser.add_argument("--global-parameters", nargs="+", type=splitExtraParameters, default=[]) @@ -246,6 +248,9 @@ def argUpdatedGlobalParameters(args): rv["ClientExecutionLockPath"] = args.client_lock if args.prebuilt_client: rv["PrebuiltClient"] = args.prebuilt_client + if args.MXScaleFormat: + print1("# Command-line override: MXScaleFormat") + rv["MXScaleFormat"] = args.MXScaleFormat for key, value in args.global_parameters: rv[key] = value diff --git a/projects/hipblaslt/tensilelite/client/include/DataInitialization.hpp b/projects/hipblaslt/tensilelite/client/include/DataInitialization.hpp index 9aa8857da1e..f6d8bbb31d9 100644 --- a/projects/hipblaslt/tensilelite/client/include/DataInitialization.hpp +++ b/projects/hipblaslt/tensilelite/client/include/DataInitialization.hpp @@ -45,6 +45,17 @@ namespace TensileLite { namespace Client { + inline bool isMXFP4Tensor(const TensorDescriptor& tensor, size_t mxBlock) + { + return tensor.dataType() == rocisa::DataType::Float4 && mxBlock > 0; + } + + inline bool isMXFP4Problem(const ContractionProblemGemm& problem) + { + return isMXFP4Tensor(problem.a(), problem.mxBlockA()) + || isMXFP4Tensor(problem.b(), problem.mxBlockB()); + } + // Problem-indept. from 0~7, and 16, and 23~26 (fixed values for every problem) // And problem-dept. from 8~15 (values depend on problem) // RandomNegPosLimited: integer -128~128. fp -1.0~1.0 @@ -832,9 +843,35 @@ namespace TensileLite } virtual void preBenchmarkRun() override {} virtual void postBenchmarkRun() override {} - virtual void preProblem(ContractionProblem* const problem) override {} + virtual void preProblem(ContractionProblem* const problem) override + { + m_currentGemmProblem + = dynamic_cast(problem); + } virtual void postProblem() override {} - virtual void preSolution(ContractionSolution* const solution) override {} + virtual void preSolution(ContractionSolution* const solution) override + { + m_currentSolution = solution; + // Re-init MX scale with preSwizzle now that solution (useScaleAB) is available + if(m_currentSolution != nullptr + && !m_currentSolution->problemType.useScaleAB.empty() + && m_currentGemmProblem != nullptr + && !m_gpuPtrs.empty()) + { + bool isMXFP4 = isMXFP4Problem(*m_currentGemmProblem); + if(isMXFP4) + { + initializeMXDataForFP4(*m_currentGemmProblem); + copyValidToGPUBuffer(*m_currentGemmProblem); + copyInputs(m_gpuPtrs, + m_gpuBatchPtrs, + m_maxElements, + m_groupedOffsets, + *m_currentGemmProblem, + hipMemcpyDeviceToDevice); + } + } + } virtual void postSolution() override {} virtual bool needMoreRunsInSolution() const override { @@ -1044,6 +1081,11 @@ namespace TensileLite int64_t m_rotatingBuffer = 0; std::shared_ptr m_rm; int32_t m_rotatingMode = 0; + + ContractionSolution const* m_currentSolution = nullptr; + ContractionProblemGemm const* m_currentGemmProblem = nullptr; + + int m_mxScaleFormat = 0; }; template <> diff --git a/projects/hipblaslt/tensilelite/client/main.cpp b/projects/hipblaslt/tensilelite/client/main.cpp index 702749b0b41..7d1d5a9f7e4 100644 --- a/projects/hipblaslt/tensilelite/client/main.cpp +++ b/projects/hipblaslt/tensilelite/client/main.cpp @@ -217,6 +217,7 @@ namespace TensileLite ("swizzle-tensor-b", po::value()->default_value(false), "Swizzle input tensor B.") ("mx-block-a", po::value()->default_value(0), "block of mx datatype input matrix A") ("mx-block-b", po::value()->default_value(0), "block of mx datatype input matrix B") + ("mx-scale-format", po::value()->default_value(0), "MX scale data format (0=none, 1=pre-swizzle for GPU kernel layout)") ("activation-compute-type", po::value()->default_value(rocisa::DataType::None), "Activation compute type.") ("high-precision-accumulate", po::value()->default_value(false), "Use high-precision accumulate.") ("sparse", po::value()->default_value(0), "A or B matrix is sparse matrix.") diff --git a/projects/hipblaslt/tensilelite/client/src/DataInitialization.cpp b/projects/hipblaslt/tensilelite/client/src/DataInitialization.cpp index 51df3f01c4c..43259a720c3 100644 --- a/projects/hipblaslt/tensilelite/client/src/DataInitialization.cpp +++ b/projects/hipblaslt/tensilelite/client/src/DataInitialization.cpp @@ -887,6 +887,7 @@ namespace TensileLite , m_keepPristineCopyOnGPU(args["pristine-on-gpu"].as()) , m_workspaceSize(problemFactory.workspaceSize()) , m_pruneMode(args["prune-mode"].as()) + , m_mxScaleFormat(args["mx-scale-format"].as()) { m_rotatingBuffer @@ -1697,8 +1698,7 @@ namespace TensileLite void DataInitialization::initializeCPUInputs(ContractionProblemGemm const& problem) { - bool useMXGenerator = (problem.a().dataType() == rocisa::DataType::Float4 && problem.mxBlockA() > 0) - || (problem.b().dataType() == rocisa::DataType::Float4 && problem.mxBlockB() > 0); + bool useMXGenerator = isMXFP4Problem(problem); if(useMXGenerator) initializeMXDataForFP4(problem); @@ -1774,10 +1774,49 @@ namespace TensileLite void DataInitialization::initializeMXDataForFP4(ContractionProblemGemm const& problem) { - std::vector emptySwizzle; - std::vector emptyTile; + // Compute preSwizzle parameters from the solution's matrix instruction to rearrange + // the scale tensor into the GPU kernel's expected memory layout + std::vector preSwizzleA, preTileA, preSwizzleB, preTileB; - if(problem.mxBlockA() > 0 && problem.a().dataType() == rocisa::DataType::Float4) + if(m_mxScaleFormat > 0 && m_currentSolution != nullptr + && !m_currentSolution->problemType.useScaleAB.empty()) + { + auto const& mi = m_currentSolution->sizeMapping.matrixInstruction; + size_t MiK = static_cast(mi[2]); + constexpr size_t swizzleTileMN = 32; // 2 SIMDs * 16 lanes per wave for MN access + constexpr size_t tileK = 256 / swizzleTileMN; // scale blocks per wave in K + + if(MiK > 0) + { + if(problem.mxBlockA() > 0 && MiK % problem.mxBlockA() == 0) + { + // scale tensor: scaleRows = sizes[0]/mxBlock, scaleCols = sizes[1] + // preSwizzle requires both to be multiples of their tile dimensions + size_t scaleRowsA = problem.a().sizes()[0] / problem.mxBlockA(); + size_t scaleColsA = problem.a().sizes()[1]; + if(scaleRowsA % tileK == 0 && scaleColsA % swizzleTileMN == 0) + { + size_t subTileK = MiK / problem.mxBlockA(); + preSwizzleA = {swizzleTileMN, tileK, subTileK}; + preTileA = {tileK, swizzleTileMN}; + } + } + + if(problem.mxBlockB() > 0 && MiK % problem.mxBlockB() == 0) + { + size_t scaleRowsB = problem.b().sizes()[0] / problem.mxBlockB(); + size_t scaleColsB = problem.b().sizes()[1]; + if(scaleRowsB % tileK == 0 && scaleColsB % swizzleTileMN == 0) + { + size_t subTileK = MiK / problem.mxBlockB(); + preSwizzleB = {swizzleTileMN, tileK, subTileK}; + preTileB = {tileK, swizzleTileMN}; + } + } + } + } + + if(isMXFP4Tensor(problem.a(), problem.mxBlockA())) { auto const& tensorA = problem.a(); auto rows = tensorA.sizes()[0]; @@ -1796,8 +1835,8 @@ namespace TensileLite cols, stride, problem.transA(), - emptySwizzle, - emptyTile, + preSwizzleA, + preTileA, problem.mxBlockA(), 1, true, @@ -1806,7 +1845,7 @@ namespace TensileLite 1.0f); } - if(problem.mxBlockB() > 0 && problem.b().dataType() == rocisa::DataType::Float4) + if(isMXFP4Tensor(problem.b(), problem.mxBlockB())) { auto const& tensorB = problem.b(); auto rows = tensorB.sizes()[0]; @@ -1825,8 +1864,8 @@ namespace TensileLite cols, stride, problem.transB(), - emptySwizzle, - emptyTile, + preSwizzleB, + preTileB, problem.mxBlockB(), 1, false, diff --git a/projects/hipblaslt/tensilelite/tests/MXDataGen_test.cpp b/projects/hipblaslt/tensilelite/tests/MXDataGen_test.cpp index 77cd7abf321..0fed9f0493c 100644 --- a/projects/hipblaslt/tensilelite/tests/MXDataGen_test.cpp +++ b/projects/hipblaslt/tensilelite/tests/MXDataGen_test.cpp @@ -165,3 +165,91 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(1024u, 213u, 32, true, true) // M=213, non-32-aligned (was failing) ) ); + +// ============================================================================ +// PreSwizzle scale tests +// +// Verify generateMXInput with preSwizzle produces scale data that is a +// permutation of the unswizzled layout. gfx950 FP4 MX kernels expect: +// preSwizzle = {swizzleTileMN=32, tileK=8, subTileK=MiK/mxBlock} +// preTile = {tileK=8, swizzleTileMN=32} +// swizzleTileMN=32 is fixed (2 SIMDs * 16 lanes); subTileK=4 for MiK=128, mxBlock=32. +// ============================================================================ + +// Params: {rows, cols, mxBlock, isTranspose, isMatrixA} +class MXPreSwizzleTest + : public ::testing::TestWithParam> +{ +}; + +/** @brief Verify preSwizzle produces a non-trivial permutation of scale data. */ +TEST_P(MXPreSwizzleTest, ScaleIsPermutationOfUnswizzled) +{ + auto [rows, cols, mxBlock, isTranspose, isMatrixA] = GetParam(); + + const std::vector preSwizzle = {32, 8, 4}; + const std::vector preTile = {8, 32}; + + const uint64_t numElements = rows * cols; + const uint64_t numPacked = (numElements + 1) / 2; + const size_t numScales = ((rows + mxBlock - 1) / mxBlock) * cols; + + std::vector dataNoShuf(numPacked, 0); + std::vector scaleNoShuf(numScales, 0); + std::vector dataShuf(numPacked, 0); + std::vector scaleShuf(numScales, 0); + + // Generate without preSwizzle + generateMXInput((hipDataType)HIP_R_4F_E2M1, + dataNoShuf.data(), + scaleNoShuf.data(), + rows, cols, rows, + isTranspose, + {}, {}, + mxBlock, 1, isMatrixA, + "Bounded", -1.0f, 1.0f); + + // Generate with preSwizzle + generateMXInput((hipDataType)HIP_R_4F_E2M1, + dataShuf.data(), + scaleShuf.data(), + rows, cols, rows, + isTranspose, + preSwizzle, preTile, + mxBlock, 1, isMatrixA, + "Bounded", -1.0f, 1.0f); + + // The scale buffers must be different + EXPECT_NE(scaleNoShuf, scaleShuf) + << "Scale data was not shuffled for " << rows << "x" << cols + << " (transpose=" << isTranspose << ", isMatrixA=" << isMatrixA << ")"; + + // The shuffled scale must be a permutation: same multiset of bytes + std::vector sortedNoShuf = scaleNoShuf; + std::vector sortedShuf = scaleShuf; + std::sort(sortedNoShuf.begin(), sortedNoShuf.end()); + std::sort(sortedShuf.begin(), sortedShuf.end()); + EXPECT_EQ(sortedNoShuf, sortedShuf) + << "Pre-shuffled scale is not a permutation of the unshuffled scale for " + << rows << "x" << cols; + + // Data buffer must be identical (preSwizzle only affects scale, not data) + EXPECT_EQ(dataNoShuf, dataShuf) + << "Data buffer changed unexpectedly with preSwizzle for " + << rows << "x" << cols; +} + +INSTANTIATE_TEST_SUITE_P( + FP4PreSwizzle, + MXPreSwizzleTest, + ::testing::Values( + // rows, cols, mxBlock, isTranspose, isMatrixA + // Test size constraints for preSwizzle {32,8,4} + preTile {8,32}: + // rows % 256 == 0 (scaleRows = rows/mxBlock must be divisible by tileK=8) + // cols % 32 == 0 (scaleCols must be divisible by swizzleTileMN=32) std::make_tuple(256u, 256u, 32, true, true), // scale A transposed + std::make_tuple(256u, 256u, 32, false, false), // scale B non-transposed + std::make_tuple(512u, 256u, 32, true, true), // larger scale A + std::make_tuple(256u, 512u, 32, false, false), // larger scale B + std::make_tuple(4096u, 16384u, 32, true, true) // benchmark-scale problem + ) +);