diff --git a/shared/rocroller/client/CMakeLists.txt b/shared/rocroller/client/CMakeLists.txt index f737b9967ee..5a6cf6c7d16 100644 --- a/shared/rocroller/client/CMakeLists.txt +++ b/shared/rocroller/client/CMakeLists.txt @@ -32,6 +32,7 @@ target_sources(rocroller-gemm "${CMAKE_CURRENT_SOURCE_DIR}/include/client/GEMMSolution.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/include/client/GraphInspector.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/include/client/GraphInspector_impl.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/include/client/PreSwizzle.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/include/client/StreamKGEMMSolution.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/include/client/visualize.hpp" ) diff --git a/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp b/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp index 4dc3419022c..97ab43c98f9 100644 --- a/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp +++ b/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp @@ -133,11 +133,23 @@ namespace rocRoller m_tagLoadScaleA = command->addOperation( rocRoller::Operations::T_Load_Tiled(m_tagTensorScaleA.value())); + auto scaleInputA = m_tagLoadScaleA; + + if(solutionParams.types.scaleSkipPermlane) + { + AssertFatal(solutionParams.types.scaleShuffleTileA.size() == 3, + ShowValue(solutionParams.types.scaleShuffleTileA)); + + scaleInputA + = command->addOperation(rocRoller::Operations::SubTileTranspose( + *m_tagLoadScaleA, solutionParams.types.scaleShuffleTileA)); + } + m_tagBlockScaleA = mulInputA = command->addOperation(rocRoller::Operations::BlockScale( m_tagA, 2, - m_tagLoadScaleA, + scaleInputA, {1, static_cast(solutionParams.types.scaleBlockSize)})); } @@ -160,11 +172,22 @@ namespace rocRoller m_tagLoadScaleB = command->addOperation( rocRoller::Operations::T_Load_Tiled(m_tagTensorScaleB.value())); + auto scaleInputB = m_tagLoadScaleB; + + if(solutionParams.types.scaleSkipPermlane) + { + AssertFatal(solutionParams.types.scaleShuffleTileB.size() == 3); + + scaleInputB + = command->addOperation(rocRoller::Operations::SubTileTranspose( + *m_tagLoadScaleB, solutionParams.types.scaleShuffleTileB)); + } + m_tagBlockScaleB = mulInputB = command->addOperation(rocRoller::Operations::BlockScale( m_tagB, 2, - m_tagLoadScaleB, + scaleInputB, {static_cast(solutionParams.types.scaleBlockSize), 1})); } diff --git a/shared/rocroller/client/include/client/GEMMParameters.hpp b/shared/rocroller/client/include/client/GEMMParameters.hpp index b981b4b42c2..169d7f0b322 100644 --- a/shared/rocroller/client/include/client/GEMMParameters.hpp +++ b/shared/rocroller/client/include/client/GEMMParameters.hpp @@ -75,6 +75,10 @@ namespace rocRoller bool scaleSkipPermlane = false; + // Order: M/N, K tile, K subtile + std::vector scaleShuffleTileA; + std::vector scaleShuffleTileB; + std::string kernelNamePart() const; }; diff --git a/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp b/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp index 61bf28888a2..deaf056d6e8 100644 --- a/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp +++ b/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp @@ -65,6 +65,9 @@ namespace rocRoller::Serialization iot::mapRequired(io, "scaleBlockSize", params.scaleBlockSize); iot::mapRequired(io, "scaleSkipPermlane", params.scaleSkipPermlane); + + iot::mapRequired(io, "scaleShuffleTileA", params.scaleShuffleTileA); + iot::mapRequired(io, "scaleShuffleTileB", params.scaleShuffleTileB); } static void mapping(IO& io, Client::GEMMClient::TypeParameters& params, EmptyContext& ctx) diff --git a/shared/rocroller/client/include/client/GraphInspector.hpp b/shared/rocroller/client/include/client/GraphInspector.hpp index 23360b5883c..d301aa5acfe 100644 --- a/shared/rocroller/client/include/client/GraphInspector.hpp +++ b/shared/rocroller/client/include/client/GraphInspector.hpp @@ -121,6 +121,11 @@ namespace rocRoller KernelGraph::CoordinateGraph::Transformer& tx(); + KernelGraph::KernelGraphPtr graph() + { + return m_kgraph; + }; + private: void assignLiteralSizesAndStrides(); diff --git a/shared/rocroller/client/include/client/PreSwizzle.hpp b/shared/rocroller/client/include/client/PreSwizzle.hpp new file mode 100644 index 00000000000..42252a62001 --- /dev/null +++ b/shared/rocroller/client/include/client/PreSwizzle.hpp @@ -0,0 +1,87 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright 2024-2025 AMD ROCm(TM) Software + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include + +#include + +#include + +namespace rocRoller::Client +{ + + template + inline std::vector preSwizzle(std::vector const& input, + TensorDescriptor const& desc, + std::vector const& tile) + { + AssertFatal(tile.size() == 3, ShowValue(tile.size()), ShowValue(tile)); + AssertFatal(desc.dimensions() == 2, + "Batch dimension not yet supported.", + ShowValue(desc.dimensions()), + ShowValue(desc)); + AssertFatal(desc.totalAllocatedElements() == input.size(), + ShowValue(desc), + ShowValue(input.size())); + + auto tileMN = tile[0]; + auto tileK = tile[1]; + auto subTileK = tile[2]; + + size_t instPerTileK = tileK / subTileK; + size_t instKPerTileMN = tileMN / subTileK; + + std::vector srcSizes = {subTileK, + instPerTileK, + desc.size(0) / (tileK), + instKPerTileMN, + subTileK, + desc.size(1) / (tileMN)}; + + TensorDescriptor src(desc.dataType(), srcSizes); + + AssertFatal(src.totalAllocatedElements() == desc.totalAllocatedElements(), + ShowValue(src.totalAllocatedElements()), + ShowValue(desc.totalAllocatedElements()), + ShowValue(src.totalAllocatedElements() / desc.totalAllocatedElements()), + ShowValue(src), + ShowValue(desc)); + + auto dst + = TensorDescriptor::ShuffledNoPadding(desc.dataType(), srcSizes, {4, 1, 2, 3, 0, 5}); + + AssertFatal(src.totalAllocatedElements() == dst.totalAllocatedElements(), + ShowValue(src.totalAllocatedElements()), + ShowValue(dst.totalAllocatedElements()), + ShowValue(src), + ShowValue(dst)); + + return shuffleDims(input, dst, src); + } + +} diff --git a/shared/rocroller/client/src/GEMMParameters.cpp b/shared/rocroller/client/src/GEMMParameters.cpp index c6be0cfed18..7d3c797bf26 100644 --- a/shared/rocroller/client/src/GEMMParameters.cpp +++ b/shared/rocroller/client/src/GEMMParameters.cpp @@ -66,7 +66,9 @@ namespace rocRoller rv << "_" << t; if(scaleSkipPermlane) - rv << "_PRE_SW"; + { + rv << "_PreSW_AB"; + } return rv.str(); } diff --git a/shared/rocroller/client/src/gemm.cpp b/shared/rocroller/client/src/gemm.cpp index d0175cbd217..279d6d62370 100644 --- a/shared/rocroller/client/src/gemm.cpp +++ b/shared/rocroller/client/src/gemm.cpp @@ -47,6 +47,7 @@ #include "client/DataParallelGEMMSolution.hpp" #include "client/GEMMParameters.hpp" #include "client/GEMMParameters_serialization.hpp" +#include "client/PreSwizzle.hpp" #include "client/StreamKGEMMSolution.hpp" #include @@ -218,11 +219,47 @@ namespace rocRoller::Client::GEMMClient ShowValue(problemParams.types.scaleB)); if(problemParams.types.scaleA == Operations::ScaleMode::Separate) { - deviceScaleA = make_shared_device(hostScaleA); + if(problemParams.types.scaleSkipPermlane) + { + AssertFatal(problemParams.types.scaleShuffleTileA.size() == 3); + + auto descScaleA = descA.withNormalizedDimensions(); + { + auto sizes = descScaleA.sizes(); + sizes[0] /= problemParams.types.scaleBlockSize; + descScaleA = TensorDescriptor(descScaleA.dataType(), std::move(sizes)); + } + + auto tmpScaleA + = preSwizzle(hostScaleA, descScaleA, problemParams.types.scaleShuffleTileA); + deviceScaleA = make_shared_device(tmpScaleA); + } + else + { + deviceScaleA = make_shared_device(hostScaleA); + } } if(problemParams.types.scaleB == Operations::ScaleMode::Separate) { - deviceScaleB = make_shared_device(hostScaleB); + if(problemParams.types.scaleSkipPermlane) + { + AssertFatal(problemParams.types.scaleShuffleTileB.size() == 3); + + auto descScaleB = descB.withNormalizedDimensions(); + { + auto sizes = descScaleB.sizes(); + sizes[0] /= problemParams.types.scaleBlockSize; + descScaleB = TensorDescriptor(descScaleB.dataType(), std::move(sizes)); + } + + auto tmpScaleB + = preSwizzle(hostScaleB, descScaleB, problemParams.types.scaleShuffleTileB); + deviceScaleB = make_shared_device(tmpScaleB); + } + else + { + deviceScaleB = make_shared_device(hostScaleB); + } } std::cout << "Generating launch parameters and runtime arguments..." << std::endl; @@ -1396,16 +1433,12 @@ int main(int argc, const char* argv[]) AssertFatal(arch.HasCapability(GPUCapability::HasBlockScaling32), fmt::format("Architecture {} does not support block scaling.", arch.target().toString())); - types.scaleBlockSize = arch.GetCapability(GPUCapability::DefaultScaleBlockSize); - problem.types.scaleBlockSize = types.scaleBlockSize; + types.scaleBlockSize = arch.GetCapability(GPUCapability::DefaultScaleBlockSize); } AssertFatal((types.typeAcc == "float") || (types.typeAcc == "half") || (types.typeAcc == "bf16")); - problem.types = types; - solution.types = types; - // TODO: Reevaluate the relationship between problem and solution params. problem.workgroupMapping = solution.workgroupMapping; @@ -1417,8 +1450,8 @@ int main(int argc, const char* argv[]) // Set default MI sizes if(arch.HasCapability(GPUCapability::HasMFMA)) { - if(solution.types.typeA == "float" && solution.types.typeB == "float" - && solution.types.typeC == "float" && solution.types.typeD == "float") + if(types.typeA == "float" && types.typeB == "float" && types.typeC == "float" + && types.typeD == "float") { if(solution.waveM == -1) solution.waveM = 32; @@ -1429,7 +1462,7 @@ int main(int argc, const char* argv[]) if(solution.waveB == -1) solution.waveB = 1; } - else if(solution.types.typeA == "half" && solution.types.typeB == "half") + else if(types.typeA == "half" && types.typeB == "half") { if(solution.waveM == -1) solution.waveM = 32; @@ -1440,7 +1473,7 @@ int main(int argc, const char* argv[]) if(solution.waveB == -1) solution.waveB = 1; } - else if(solution.types.typeA == "bf16" && solution.types.typeB == "bf16") + else if(types.typeA == "bf16" && types.typeB == "bf16") { if(solution.waveM == -1) solution.waveM = 16; @@ -1451,8 +1484,8 @@ int main(int argc, const char* argv[]) if(solution.waveB == -1) solution.waveB = 1; } - else if((solution.types.typeA == "fp8" && solution.types.typeB == "fp8") - || (solution.types.typeA == "bf8" && solution.types.typeB == "bf8")) + else if((types.typeA == "fp8" && types.typeB == "fp8") + || (types.typeA == "bf8" && types.typeB == "bf8")) { if(solution.waveM == -1) solution.waveM = 16; @@ -1468,12 +1501,12 @@ int main(int argc, const char* argv[]) { if(arch.target().isRDNA4GPU()) { - if((solution.types.typeA == "half" && solution.types.typeB == "half") - || (solution.types.typeA == "bf16" && solution.types.typeB == "bf16") - || (solution.types.typeA == "fp8" && solution.types.typeB == "fp8") - || (solution.types.typeA == "bf8" && solution.types.typeB == "bf8") - || (solution.types.typeA == "bf8" && solution.types.typeB == "fp8") - || (solution.types.typeA == "fp8" && solution.types.typeB == "bf8")) + if((types.typeA == "half" && types.typeB == "half") + || (types.typeA == "bf16" && types.typeB == "bf16") + || (types.typeA == "fp8" && types.typeB == "fp8") + || (types.typeA == "bf8" && types.typeB == "bf8") + || (types.typeA == "bf8" && types.typeB == "fp8") + || (types.typeA == "fp8" && types.typeB == "bf8")) { if(solution.waveM == -1) solution.waveM = 16; @@ -1489,24 +1522,24 @@ int main(int argc, const char* argv[]) // Override default settings for the `example` and `generate` subcommands. if(example->parsed() || generate->parsed()) { - solution.types.typeA = "half"; - solution.types.typeB = "half"; - solution.types.typeC = "half"; - solution.types.typeD = "half"; - solution.waveM = 16; - solution.waveN = 16; - solution.waveK = 16; - solution.waveB = 1; + types.typeA = "half"; + types.typeB = "half"; + types.typeC = "half"; + types.typeD = "half"; + solution.waveM = 16; + solution.waveN = 16; + solution.waveK = 16; + solution.waveB = 1; } else { Throw("Unsupported MI on: ", arch.target().toString(), - ShowValue(solution.types.typeA), - ShowValue(solution.types.typeB), - ShowValue(solution.types.typeC), - ShowValue(solution.types.typeD), - ShowValue(solution.types.typeAcc)); + ShowValue(types.typeA), + ShowValue(types.typeB), + ShowValue(types.typeC), + ShowValue(types.typeD), + ShowValue(types.typeAcc)); } } // TODO Support prefetch on gfx12 @@ -1522,6 +1555,39 @@ int main(int argc, const char* argv[]) Throw("Unsupported arch for GEMM client: ", arch.target().toString()); } + if(types.scaleSkipPermlane) + { + AssertFatal(types.transA == Client::GEMMClient::TransposeType::T, ShowValue(types)); + AssertFatal(types.scaleA == Operations::ScaleMode::Separate, ShowValue(types)); + + size_t kSubtile = solution.waveK / types.scaleBlockSize; + + AssertFatal(kSubtile == 2 || kSubtile == 4, + ShowValue(kSubtile), + ShowValue(solution.waveK), + ShowValue(types.scaleBlockSize)); + + types.scaleShuffleTileA = {64, 4, kSubtile}; + } + + if(types.scaleSkipPermlane) + { + AssertFatal(types.transB == Client::GEMMClient::TransposeType::N, ShowValue(types)); + AssertFatal(types.scaleB == Operations::ScaleMode::Separate, ShowValue(types)); + + size_t kSubtile = solution.waveK / types.scaleBlockSize; + + AssertFatal(kSubtile == 2 || kSubtile == 4, + ShowValue(kSubtile), + ShowValue(solution.waveK), + ShowValue(types.scaleBlockSize)); + + types.scaleShuffleTileB = {64, 4, kSubtile}; + } + + problem.types = types; + solution.types = types; + // Set default prefetchMixMemOps if(prefetchMixMemOpsFlag->count() == 0) { @@ -1530,10 +1596,10 @@ int main(int argc, const char* argv[]) if(solution.prefetchLDSFactor != 0) solution.prefetchMixMemOps = true; - if(solution.types.scaleB == Operations::ScaleMode::Separate && !solution.loadLDSScaleB) + if(types.scaleB == Operations::ScaleMode::Separate && !solution.loadLDSScaleB) solution.prefetchMixMemOps = false; - if(solution.types.scaleA == Operations::ScaleMode::Separate && !solution.loadLDSScaleA) + if(types.scaleA == Operations::ScaleMode::Separate && !solution.loadLDSScaleA) solution.prefetchMixMemOps = false; // TODO: enable (prefetchMixMemOps == true && prefetchLDSFactor == 2 && direct2LDSA/B = true) diff --git a/shared/rocroller/client/test_gemm_client.py b/shared/rocroller/client/test_gemm_client.py index 90e49d0aa72..a9b2ee0bcc2 100644 --- a/shared/rocroller/client/test_gemm_client.py +++ b/shared/rocroller/client/test_gemm_client.py @@ -270,6 +270,8 @@ def client_arguments(self): scale_B: None scaleType_B: None scaleBlockSize: 0 + scaleShuffleTileA: [] + scaleShuffleTileB: [] scaleSkipPermlane: false streamK: false streamKTwoTile: false @@ -327,6 +329,8 @@ def client_arguments(self): scale_B: None scaleType_B: None scaleBlockSize: 0 + scaleShuffleTileA: [] + scaleShuffleTileB: [] scaleSkipPermlane: false loadLDSScale_A: false loadLDSScale_B: false @@ -382,6 +386,8 @@ def client_arguments(self): scale_B: None scaleType_B: None scaleBlockSize: 0 + scaleShuffleTileA: [] + scaleShuffleTileB: [] scaleSkipPermlane: false loadLDSScale_A: false loadLDSScale_B: false diff --git a/shared/rocroller/lib/include/CMakeLists.txt b/shared/rocroller/lib/include/CMakeLists.txt index 7fb1540a855..d17938edddf 100644 --- a/shared/rocroller/lib/include/CMakeLists.txt +++ b/shared/rocroller/lib/include/CMakeLists.txt @@ -33,6 +33,7 @@ target_sources(rocroller "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/ScheduledInstructions.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/ScheduledInstructions_fwd.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/TensorDescriptor.hpp" + "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/TensorDescriptor_impl.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/Assemblers/Assembler.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/Assemblers/Assembler_fwd.hpp" diff --git a/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp b/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp index bba66d9b3ab..77647d91a0b 100644 --- a/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp +++ b/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp @@ -450,6 +450,8 @@ namespace rocRoller Operations::ScaleMode scaleModeB = Operations::ScaleMode::None; std::vector scaleStridesA; std::vector scaleStridesB; + std::vector scalePreShuffledTileA; + std::vector scalePreShuffledTileB; VariableType accType = DataType::Float; std::string name() const; diff --git a/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp b/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp index e021ce64a16..c87206c1ed9 100644 --- a/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp +++ b/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp @@ -182,7 +182,7 @@ namespace rocRoller * * @ingroup KernelGraph */ - KernelGraph translate(CommandPtr); + KernelGraph translate(CommandPtr, CommandParametersPtr params = nullptr); /** * Generate assembly from a KernelGraph. diff --git a/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp b/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp index 0c0c0435147..e5a789e8e59 100644 --- a/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp +++ b/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp @@ -118,8 +118,8 @@ namespace rocRoller /** * Skip generation of permlane instructions when loading scale data. - * This is experimental and will cause the validation to fail, but will - * show better performance. + * This is experimental and requires that the input be specifically + * modified, but will show better performance. */ bool scaleSkipPermlane = false; diff --git a/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp b/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp index 90f49e319d2..b1545748ca6 100644 --- a/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp +++ b/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp @@ -59,7 +59,7 @@ namespace rocRoller explicit BlockScale(OperationTag data, int dimensions, std::optional scale = {}, - std::vector const& strides = {}); + std::vector strides = {}); std::unordered_set getInputs() const; std::string toString() const; @@ -78,5 +78,29 @@ namespace rocRoller template friend struct rocRoller::Serialization::MappingTraits; }; + + class SubTileTranspose : public BaseOperation + { + public: + SubTileTranspose() = delete; + + explicit SubTileTranspose(OperationTag input, std::vector tileDimensions); + + std::unordered_set getInputs() const; + std::string toString() const; + std::vector const& tileDimensions() const; + + auto operator<=>(SubTileTranspose const&) const = default; + bool operator==(SubTileTranspose const& other) const; + + OperationTag input() const; + + private: + OperationTag m_input; + std::vector m_tileDimensions; + + template + friend struct rocRoller::Serialization::MappingTraits; + }; } } diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp index 2ac6afa0462..45bf53e8a45 100644 --- a/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp +++ b/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp @@ -45,6 +45,8 @@ namespace rocRoller OperationTag getTag() const; void setTag(OperationTag tag); + std::strong_ordering operator<=>(BaseOperation const&) const; + protected: OperationTag m_tag; std::weak_ptr m_command; diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp index 54365e6beb4..a75ac7c5366 100644 --- a/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp +++ b/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp @@ -48,7 +48,7 @@ namespace rocRoller { Nop() {} template - Nop(Args&&... i) + explicit Nop(Args&&... i) { } @@ -63,6 +63,7 @@ namespace rocRoller std::unordered_set operator()(Scalar const&); std::unordered_set operator()(Literal const&); std::unordered_set operator()(BlockScale const&); + std::unordered_set operator()(SubTileTranspose const&); std::unordered_set operator()(T_Load_Linear const&); std::unordered_set operator()(T_Load_Scalar const&); std::unordered_set operator()(T_Load_Tiled const&); @@ -87,6 +88,7 @@ namespace rocRoller std::unordered_set operator()(Scalar const&); std::unordered_set operator()(Literal const&); std::unordered_set operator()(BlockScale const&); + std::unordered_set operator()(SubTileTranspose const&); std::unordered_set operator()(T_Load_Linear const&); std::unordered_set operator()(T_Load_Scalar const&); std::unordered_set operator()(T_Load_Tiled const&); @@ -119,6 +121,7 @@ namespace rocRoller std::unordered_set operator()(Scalar&); std::unordered_set operator()(Literal&); std::unordered_set operator()(BlockScale&); + std::unordered_set operator()(SubTileTranspose&); std::unordered_set operator()(T_Load_Linear&); std::unordered_set operator()(T_Load_Scalar&); std::unordered_set operator()(T_Load_Tiled&); @@ -146,6 +149,7 @@ namespace rocRoller std::string operator()(Scalar const&); std::string operator()(Literal const&); std::string operator()(BlockScale const&); + std::string operator()(SubTileTranspose const&); std::string operator()(T_Load_Linear const&); std::string operator()(T_Load_Scalar const&); std::string operator()(T_Load_Tiled const&); @@ -175,6 +179,7 @@ namespace rocRoller void operator()(Scalar&); void operator()(Literal&); void operator()(BlockScale&); + void operator()(SubTileTranspose&); void operator()(T_Load_Linear&); void operator()(T_Load_Scalar&); void operator()(T_Load_Tiled&); @@ -196,6 +201,7 @@ namespace rocRoller void operator()(Scalar&); void operator()(Literal&); void operator()(BlockScale&); + void operator()(SubTileTranspose&); void operator()(T_Load_Linear&); void operator()(T_Load_Scalar&); void operator()(T_Load_Tiled&); @@ -215,6 +221,7 @@ namespace rocRoller rocRoller::VariableType operator()(Scalar&); rocRoller::VariableType operator()(Literal&); rocRoller::VariableType operator()(BlockScale&); + rocRoller::VariableType operator()(SubTileTranspose&); rocRoller::VariableType operator()(T_Load_Linear&); rocRoller::VariableType operator()(T_Load_Scalar&); rocRoller::VariableType operator()(T_Load_Tiled&); diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp index 1bf71bdbdc4..d19529ec075 100644 --- a/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp +++ b/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp @@ -40,6 +40,7 @@ namespace rocRoller struct Scalar; struct Literal; struct BlockScale; + struct SubTileTranspose; struct T_Load_Linear; struct T_Load_Scalar; struct T_Load_Tiled; @@ -53,6 +54,7 @@ namespace rocRoller Scalar, Literal, BlockScale, + SubTileTranspose, T_Load_Linear, T_Load_Scalar, T_Load_Tiled, diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp index 2a2ab13efe4..0e6a5b0e949 100644 --- a/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp +++ b/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp @@ -63,6 +63,11 @@ namespace rocRoller return blockScale.getInputs(); } + inline std::unordered_set Inputs::operator()(SubTileTranspose const& op) + { + return op.getInputs(); + } + inline std::unordered_set Inputs::operator()(T_Load_Linear const& load) { return {}; @@ -153,6 +158,11 @@ namespace rocRoller return {blockScale.getTag()}; } + inline std::unordered_set Outputs::operator()(SubTileTranspose const& op) + { + return {op.getTag()}; + } + inline std::unordered_set Outputs::operator()(T_Load_Linear const& load) { return {load.getTag()}; @@ -290,6 +300,17 @@ namespace rocRoller return {blockScale.getTag()}; } + inline std::unordered_set AssignOutputs::operator()(SubTileTranspose& op) + { + if(op.getTag().uninitialized()) + { + op.setTag(m_nextTagValue); + ++m_nextTagValue; + } + + return {op.getTag()}; + } + inline std::unordered_set AssignOutputs::operator()(T_Load_Linear& load) { if(load.getTag().uninitialized()) @@ -452,6 +473,11 @@ namespace rocRoller return blockScale.toString(); } + inline std::string ToStringVisitor::operator()(SubTileTranspose const& op) + { + return op.toString(); + } + inline std::string ToStringVisitor::operator()(T_Load_Linear const& load) { return load.toString(); @@ -547,6 +573,11 @@ namespace rocRoller blockScale.setCommand(command); } + inline void SetCommand::operator()(SubTileTranspose& op) + { + op.setCommand(command); + } + inline void SetCommand::operator()(T_Load_Linear& load) { load.setCommand(command); @@ -608,6 +639,8 @@ namespace rocRoller inline void AllocateArguments::operator()(BlockScale& blockScale) {} + inline void AllocateArguments::operator()(SubTileTranspose&) {} + inline void AllocateArguments::operator()(T_Load_Linear& load) {} inline void AllocateArguments::operator()(T_Load_Scalar& load) {} @@ -656,6 +689,12 @@ namespace rocRoller return {rocRoller::DataType::None}; } + inline rocRoller::VariableType + rocRoller::Operations::VariableTypeVisitor::operator()(SubTileTranspose&) + { + return {rocRoller::DataType::None}; + } + inline rocRoller::VariableType rocRoller::Operations::VariableTypeVisitor::operator()(T_Load_Linear& load) { @@ -734,6 +773,7 @@ namespace rocRoller RR_OPERATION_NAME(Scalar); RR_OPERATION_NAME(Literal); RR_OPERATION_NAME(BlockScale); + RR_OPERATION_NAME(SubTileTranspose); RR_OPERATION_NAME(T_Load_Linear); RR_OPERATION_NAME(T_Load_Scalar); RR_OPERATION_NAME(T_Load_Tiled); diff --git a/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp b/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp index 93994c1e9e7..ff8ebe872e0 100644 --- a/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp +++ b/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp @@ -413,12 +413,17 @@ namespace rocRoller using iot = IOTraits; static void mapping(IO& io, KernelGraph::ControlGraph::TensorContraction& op, Context&) { + static_assert(sizeof(op) == 160); + iot::mapRequired(io, "aDims", op.aDims); iot::mapRequired(io, "bDims", op.bDims); iot::mapRequired(io, "scaleModeA", op.scaleModeA); iot::mapRequired(io, "scaleModeB", op.scaleModeB); iot::mapRequired(io, "scaleStridesA", op.scaleStridesA); iot::mapRequired(io, "scaleStridesB", op.scaleStridesB); + iot::mapRequired(io, "scalePreShuffledTileA", op.scalePreShuffledTileA); + iot::mapRequired(io, "scalePreShuffledTileB", op.scalePreShuffledTileB); + iot::mapRequired(io, "accType", op.accType); } static void mapping(IO& io, KernelGraph::ControlGraph::TensorContraction& op) diff --git a/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp b/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp index b186a73d777..5084d68c992 100644 --- a/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp +++ b/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp @@ -169,6 +169,27 @@ namespace rocRoller } }; + template + struct MappingTraits + { + using TOp = Operations::SubTileTranspose; + using iot = IOTraits; + + static void mapping(IO& io, TOp& op, Context& ctx) + { + iot::mapRequired(io, "tag", op.m_tag); + iot::mapRequired(io, "input", op.m_input); + iot::mapRequired(io, "tileDimensions", op.m_tileDimensions); + } + + static void mapping(IO& io, TOp& val) + { + AssertFatal((std::same_as)); + Context ctx; + mapping(io, val, ctx); + } + }; + template struct MappingTraits { @@ -419,6 +440,15 @@ namespace rocRoller } }; + template <> + struct DefaultConstruct + { + static Operations::Operation call() + { + return Operations::SubTileTranspose(Operations::OperationTag(-1), {}); + } + }; + template <> struct DefaultConstruct { diff --git a/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp b/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp index d3cac75c4b3..a2676d0ea92 100644 --- a/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp +++ b/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp @@ -31,9 +31,41 @@ #include #include #include +#include namespace rocRoller { + /** + * [sizeBegin,sizeEnd): A range of values representing sizes of dimensions. + * Returns the number of coordinates within that space, + * i.e. the product of those values. + */ + template + size_t CoordCount(SizeIter sizeBegin, SizeIter sizeEnd); + + /** + * [sizeBegin,sizeEnd): A range of values representing sizes of dimensions. + * Writes into [coordBegin:coordEnd) the coordinates numbered `num` within + * a linearization of all coordinates with earlier values being + * faster-moving. + */ + template + void CoordNumbered( + size_t num, CoordIter coordBegin, CoordIter coordEnd, SizeIter sizeBegin, SizeIter sizeEnd); + + /** + * If [coordBegin, coordEnd) represents coordinate x within the + * linearization, updates it to represent coordinate x+1. + * + * If [coordBegin, coordEnd) represents the last coordinate within the + * linearization, it will be reset to all 0s and false will be returned. + */ + template + bool IncrementCoord(CoordIter coordBegin, + CoordIter coordEnd, + SizeIter sizeBegin, + SizeIter sizeEnd); + /* * Describes a tensor including dimensions, memory layout, and data type. * Decoupled from any particular pointer value or memory location. @@ -41,10 +73,7 @@ namespace rocRoller class TensorDescriptor { public: - TensorDescriptor() - { - this->calculate(); - } + TensorDescriptor(); template TensorDescriptor(DataType t, @@ -52,201 +81,127 @@ namespace rocRoller IterA sizesEnd, IterB stridesBegin, IterB stridesEnd, - size_t offset = 0) - : m_sizes(sizesBegin, sizesEnd) - , m_strides(stridesBegin, stridesEnd) - , m_dataType(t) - , m_offset(offset) - { - this->calculate(); - } + size_t offset = 0); template - TensorDescriptor(DataType t, Iter sizesBegin, Iter sizesEnd, size_t offset = 0) - : m_sizes(sizesBegin, sizesEnd) - , m_dataType(t) - , m_offset(offset) - { - this->calculate(); - } - - /* - * Allow directly specifying total number of elements instead of sizes - */ + TensorDescriptor(DataType t, Iter sizesBegin, Iter sizesEnd, size_t offset = 0); + + /** + * Allow directly specifying total number of elements instead of sizes + */ TensorDescriptor(DataType t, size_t totalLogicalElements, std::initializer_list strides, - size_t offset = 0) - : m_totalLogicalElements(totalLogicalElements) - , m_strides(strides) - , m_dataType(t) - , m_offset(offset) - { - this->calculate(); - } - - TensorDescriptor(DataType t, std::initializer_list sizes, size_t offset = 0) - : m_sizes(sizes) - , m_dataType(t) - , m_offset(offset) - - { - this->calculate(); - } + size_t offset = 0); + + TensorDescriptor(DataType t, std::initializer_list sizes, size_t offset = 0); + + TensorDescriptor(DataType t, std::vector sizes, size_t offset = 0); TensorDescriptor(DataType t, std::initializer_list sizes, std::initializer_list strides, - size_t offset = 0) - : m_sizes(sizes) - , m_strides(strides) - , m_dataType(t) - , m_offset(offset) - { - this->calculate(); - } - - // Specialized constructor for 2-D tensor (i.e., matrix) + size_t offset = 0); + + TensorDescriptor(DataType t, + std::vector sizes, + std::vector strides, + size_t offset = 0); + + /** + * Specialized constructor for 2-D tensor (i.e., matrix) + */ TensorDescriptor(DataType t, std::array sizes, std::string const& transpose, - size_t offset = 0) - : m_sizes(sizes.begin(), sizes.end()) - , m_dataType(t) - , m_offset(offset) - { - if(transpose == "T") - { - m_strides = {m_sizes[1], 1u}; - } - else - { - m_strides = {1u, m_sizes[0]}; - } - this->calculate(); - } - - void calculate() - { - if(m_strides.size() < m_sizes.size()) - { - m_strides.resize(m_sizes.size(), UseDefaultStride); - if(m_strides[0] == UseDefaultStride) - { - m_strides[0] = 1; - } - } - - // Calculate total number of logical elements and update strides - if(not m_sizes.empty()) - { - m_totalLogicalElements = m_sizes[0]; - } - for(int i = 1; i < m_sizes.size(); i++) - { - m_totalLogicalElements *= m_sizes[i]; - if(m_strides[i] == UseDefaultStride) - { - m_strides[i] = m_strides[i - 1] * m_sizes[i - 1]; - } - } - - // Calculate total number of allocated elements - if(not m_sizes.empty()) - { - m_totalAllocatedElements = 1; - for(size_t i = 0; i < m_sizes.size(); i++) - m_totalAllocatedElements += m_strides[i] * (m_sizes[i] - 1); - } - else - { - m_totalAllocatedElements = m_totalLogicalElements; - } - m_totalAllocatedElements += m_offset; - } - - const size_t size(size_t index) const - { - return m_sizes[index]; - } - const std::vector& sizes() const - { - return m_sizes; - } - const size_t stride(size_t index) const - { - return m_strides[index]; - } - const std::vector& strides() const - { - return m_strides; - } - - size_t offset() const - { - return m_offset; - } - - size_t dimensions() const - { - return m_sizes.size(); - } - size_t totalLogicalElements() const - { - return m_totalLogicalElements; - } - size_t totalAllocatedElements() const - { - return m_totalAllocatedElements; - } - size_t elementBytes() const - { - return DataTypeInfo::Get(m_dataType).elementBytes; - } - - DataType dataType() const - { - return m_dataType; - } - - bool operator==(const TensorDescriptor& rhs) const - { - return m_dataType == rhs.m_dataType && m_sizes == rhs.m_sizes - && m_strides == rhs.m_strides && m_offset == rhs.m_offset; - } - - bool operator!=(const TensorDescriptor& rhs) const - { - return !(*this == rhs); - } - - std::string toString() const - { - std::ostringstream result; - - auto join = [&](std::vector const& items) { - if(items.empty()) - return; - - result << "("; - auto last_item = std::prev(items.end()); - for(auto iter = items.begin(); iter != last_item; iter++) - result << *iter << ","; - result << *last_item << ")"; - }; - - result << dimensions() << "-tensor<" << dataType() << "> "; - join(m_sizes); - join(m_strides); - result << " offset: " << m_offset; - return result.str(); - } + size_t offset = 0); + + static TensorDescriptor + ShuffledNoPadding(DataType t, std::vector sizes, std::vector dimOrder); + + static TensorDescriptor ShuffledNoPadding(DataType t, + std::initializer_list sizes, + std::vector dimOrder); + + static TensorDescriptor ShuffledNoPadding(DataType t, + std::vector sizes, + std::initializer_list dimOrder); + + static TensorDescriptor ShuffledNoPadding(DataType t, + std::initializer_list sizes, + std::initializer_list dimOrder); + + inline void calculate(); + + const size_t size(size_t index) const; + + const std::vector& sizes() const; + + const size_t stride(size_t index) const; + + const std::vector& strides() const; + + size_t offset() const; + + size_t dimensions() const; + + size_t totalLogicalElements() const; + + size_t totalAllocatedElements() const; + + size_t totalAllocatedBytes() const; + + size_t elementBytes() const; + + /** + * Returns the number of elements of padding in the given dimension (0 if + * unpadded). May be negative if stride is less than size + */ + int64_t dimensionPadding(size_t dim) const; + + /** + * Collapses dimensions in the interval [begin, end). + * + * preconditions: + * - end >= begin + * - begin < dimensions() + * - end <= dimensions() + * - dimensions in the interval [begin, end-1) are not padded. + * + * postconditions: + * - dimensions() is diminished by end-begin + * - total elements (allocated and logical) remain the same + * - dimension 'begin' is the product of all the dimensions in the interval + * [begin, end). + */ + void collapseDims(size_t begin, size_t end); + + DataType dataType() const; + + bool operator==(const TensorDescriptor& rhs) const; + + bool operator!=(const TensorDescriptor& rhs) const; + + std::string toString() const; + + template + inline size_t index(Container const& indices) const; + + template + inline size_t index(std::initializer_list indices) const; + + template + inline size_t index(Ts... is) const; + + inline bool incrementCoord(std::vector& coord, size_t firstDimension = 0) const; + + TensorDescriptor withNormalizedDimensions() const; friend std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t); - private: static inline const size_t UseDefaultStride = -1; + private: std::vector m_sizes; std::vector m_strides; size_t m_offset = 0; @@ -261,17 +216,24 @@ namespace rocRoller inline void setCommandTensorArg(rocRoller::CommandArguments& commandArgs, rocRoller::Operations::OperationTag const& tag, TensorDescriptor& desc, - T value) - { - commandArgs.setArgument(tag, ArgumentType::Value, value); - commandArgs.setArgument(tag, ArgumentType::Limit, desc.totalLogicalElements()); - - auto const& sizes = desc.sizes(); - for(size_t i = 0; i < sizes.size(); i++) - commandArgs.setArgument(tag, ArgumentType::Size, i, sizes[i]); - - auto const& strides = desc.strides(); - for(size_t i = 0; i < strides.size(); i++) - commandArgs.setArgument(tag, ArgumentType::Stride, i, (size_t)strides[i]); - } + T value); + + template + std::string writeTensor(std::vector const& data, TensorDescriptor desc); + + /** + * `dst` and `src` must be two TensorDescriptors with the same data type + * and dimension sizes (i.e. dst.sizes() == src.sizes()). They should have + * different strides (or this function is a no-op). + * + * `input` must contain data arranged according to `src`. + * + * Returns `input` rearranged according to the strides in `dst`. + */ + template + inline std::vector shuffleDims(std::vector const& input, + TensorDescriptor const& dst, + TensorDescriptor const& src); } + +#include "TensorDescriptor_impl.hpp" diff --git a/shared/rocroller/lib/include/rocRoller/TensorDescriptor_impl.hpp b/shared/rocroller/lib/include/rocRoller/TensorDescriptor_impl.hpp new file mode 100644 index 00000000000..9b7f4ce06e7 --- /dev/null +++ b/shared/rocroller/lib/include/rocRoller/TensorDescriptor_impl.hpp @@ -0,0 +1,539 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright 2024-2025 AMD ROCm(TM) Software + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include + +#include + +#include +#include +#include +#include + +namespace rocRoller +{ + template + inline size_t CoordCount(SizeIter sizeBegin, SizeIter sizeEnd) + { + size_t rv = 1; + + while(sizeBegin != sizeEnd) + { + rv *= *sizeBegin; + sizeBegin++; + } + + return rv; + } + + template + inline void CoordNumbered( + size_t num, CoordIter coordBegin, CoordIter coordEnd, SizeIter sizeBegin, SizeIter sizeEnd) + { + auto coord = coordBegin; + auto size = sizeBegin; + + while(coord != coordEnd && size != sizeEnd) + { + *coord = num % *size; + num /= *size; + + coord++; + size++; + } + + if(coord != coordEnd || size != sizeEnd) + throw std::runtime_error("Inconsistent size of coordinates."); + } + + template + inline bool IncrementCoord(CoordIter coordBegin, + CoordIter coordEnd, + SizeIter sizeBegin, + SizeIter sizeEnd) + { + auto coord = coordBegin; + auto size = sizeBegin; + + while(coord != coordEnd) + { + (*coord)++; + if(*coord < *size) + return true; + + *coord = 0; + + coord++; + size++; + } + + return false; + } + + inline TensorDescriptor::TensorDescriptor() + { + this->calculate(); + } + + template + TensorDescriptor::TensorDescriptor(DataType t, + IterA sizesBegin, + IterA sizesEnd, + IterB stridesBegin, + IterB stridesEnd, + size_t offset) + : m_sizes(sizesBegin, sizesEnd) + , m_strides(stridesBegin, stridesEnd) + , m_dataType(t) + , m_offset(offset) + { + this->calculate(); + } + + template + TensorDescriptor::TensorDescriptor(DataType t, Iter sizesBegin, Iter sizesEnd, size_t offset) + : m_sizes(sizesBegin, sizesEnd) + , m_dataType(t) + , m_offset(offset) + { + this->calculate(); + } + + inline TensorDescriptor::TensorDescriptor(DataType t, + size_t totalLogicalElements, + std::initializer_list strides, + size_t offset) + : m_totalLogicalElements(totalLogicalElements) + , m_strides(strides) + , m_dataType(t) + , m_offset(offset) + { + this->calculate(); + } + + inline TensorDescriptor::TensorDescriptor(DataType t, + std::initializer_list sizes, + size_t offset) + : m_sizes(sizes) + , m_dataType(t) + , m_offset(offset) + + { + this->calculate(); + } + + inline TensorDescriptor::TensorDescriptor(DataType t, std::vector sizes, size_t offset) + : m_sizes(std::move(sizes)) + , m_dataType(t) + , m_offset(offset) + { + this->calculate(); + } + + inline TensorDescriptor::TensorDescriptor(DataType t, + std::initializer_list sizes, + std::initializer_list strides, + size_t offset) + : m_sizes(sizes) + , m_strides(strides) + , m_dataType(t) + , m_offset(offset) + { + this->calculate(); + } + + inline TensorDescriptor::TensorDescriptor(DataType t, + std::vector sizes, + std::vector strides, + size_t offset) + : m_sizes(std::move(sizes)) + , m_strides(std::move(strides)) + , m_dataType(t) + , m_offset(offset) + { + this->calculate(); + } + + // Specialized constructor for 2-D tensor (i.e., matrix) + inline TensorDescriptor::TensorDescriptor(DataType t, + std::array sizes, + std::string const& transpose, + size_t offset) + : m_sizes(sizes.begin(), sizes.end()) + , m_dataType(t) + , m_offset(offset) + { + if(transpose == "T") + { + m_strides = {m_sizes[1], 1u}; + } + else + { + m_strides = {1u, m_sizes[0]}; + } + this->calculate(); + } + + inline TensorDescriptor TensorDescriptor::ShuffledNoPadding(DataType t, + std::vector sizes, + std::vector dimOrder) + { + AssertFatal(sizes.size() == dimOrder.size(), ShowValue(sizes), ShowValue(dimOrder)); + + std::vector strides(sizes.size(), 0); + + size_t stride = 1; + for(auto idx : dimOrder) + { + strides.at(idx) = stride; + stride *= sizes.at(idx); + } + + return TensorDescriptor(t, std::move(sizes), std::move(strides)); + } + + inline TensorDescriptor TensorDescriptor::ShuffledNoPadding(DataType t, + std::initializer_list sizes, + std::vector dimOrder) + { + std::vector theSizes(std::move(sizes)); + return ShuffledNoPadding(t, std::move(theSizes), std::move(dimOrder)); + } + + inline TensorDescriptor TensorDescriptor::ShuffledNoPadding( + DataType t, std::vector sizes, std::initializer_list dimOrder) + { + std::vector theDimOrder(std::move(dimOrder)); + return ShuffledNoPadding(t, std::move(sizes), std::move(theDimOrder)); + } + + inline TensorDescriptor TensorDescriptor::ShuffledNoPadding( + DataType t, std::initializer_list sizes, std::initializer_list dimOrder) + { + std::vector theSizes(std::move(sizes)); + std::vector theDimOrder(std::move(dimOrder)); + + return ShuffledNoPadding(t, std::move(theSizes), std::move(theDimOrder)); + } + + inline void TensorDescriptor::calculate() + { + if(m_sizes.empty()) + { + m_strides = m_sizes; + m_totalLogicalElements = 0; + m_totalAllocatedElements = 0; + return; + } + + m_strides.resize(m_sizes.size(), UseDefaultStride); + if(m_strides[0] == UseDefaultStride) + { + m_strides[0] = 1; + } + m_totalLogicalElements = m_sizes[0]; + + for(int i = 1; i < m_sizes.size(); i++) + { + m_totalLogicalElements *= m_sizes[i]; + + if(m_strides[i] == UseDefaultStride) + { + m_strides[i] = m_strides[i - 1] * m_sizes[i - 1]; + } + } + + m_totalAllocatedElements = 1; + for(int i = 0; i < m_sizes.size(); i++) + m_totalAllocatedElements += m_strides[i] * (m_sizes[i] - 1); + + m_totalAllocatedElements += m_offset; + } + + inline const size_t TensorDescriptor::size(size_t index) const + { + return m_sizes[index]; + } + inline const std::vector& TensorDescriptor::sizes() const + { + return m_sizes; + } + inline const size_t TensorDescriptor::stride(size_t index) const + { + return m_strides[index]; + } + inline const std::vector& TensorDescriptor::strides() const + { + return m_strides; + } + + inline size_t TensorDescriptor::offset() const + { + return m_offset; + } + + inline size_t TensorDescriptor::dimensions() const + { + return m_sizes.size(); + } + inline size_t TensorDescriptor::totalLogicalElements() const + { + return m_totalLogicalElements; + } + inline size_t TensorDescriptor::totalAllocatedElements() const + { + return m_totalAllocatedElements; + } + inline size_t TensorDescriptor::totalAllocatedBytes() const + { + return totalAllocatedElements() * elementBytes(); + } + inline size_t TensorDescriptor::elementBytes() const + { + return DataTypeInfo::Get(m_dataType).elementBytes; + } + + inline DataType TensorDescriptor::dataType() const + { + return m_dataType; + } + + inline bool TensorDescriptor::operator==(const TensorDescriptor& rhs) const + { + return m_dataType == rhs.m_dataType && m_sizes == rhs.m_sizes && m_strides == rhs.m_strides + && m_offset == rhs.m_offset; + } + + inline bool TensorDescriptor::operator!=(const TensorDescriptor& rhs) const + { + return !(*this == rhs); + } + + inline std::string TensorDescriptor::toString() const + { + std::ostringstream result; + + auto join = [&](std::vector const& items) { + if(items.empty()) + return; + + result << "("; + auto last_item = std::prev(items.end()); + for(auto iter = items.begin(); iter != last_item; iter++) + result << *iter << ","; + result << *last_item << ")"; + }; + + result << dimensions() << "-tensor<" << dataType() << "> "; + join(m_sizes); + join(m_strides); + result << " offset: " << m_offset; + return result.str(); + } + + template + inline size_t TensorDescriptor::index(Container const& indices) const + { + if(indices.size() != dimensions()) + throw std::runtime_error("Incorrect number of indices."); + + for(int i = 0; i < indices.size(); i++) + if(indices[i] >= m_sizes[i]) + throw std::runtime_error("Index out of bounds."); + + return std::inner_product(indices.begin(), indices.end(), m_strides.begin(), m_offset); + } + + template + inline size_t TensorDescriptor::index(std::initializer_list indices) const + { + if(indices.size() != dimensions()) + throw std::runtime_error("Incorrect number of indices."); + + for(auto i = std::make_pair(indices.begin(), m_sizes.begin()); i.first != indices.end(); + i.first++, i.second++) + if(*i.first >= *i.second) + throw std::runtime_error("Index out of bounds."); + + return std::inner_product(indices.begin(), indices.end(), m_strides.begin(), m_offset); + } + + template + inline size_t TensorDescriptor::index(Ts... is) const + { + return this->index({is...}); + } + + inline bool TensorDescriptor::incrementCoord(std::vector& coord, + size_t firstDimension) const + { + if(coord.size() != dimensions()) + throw std::runtime_error(concatenate( + "Invalid coordinate size ", coord.size(), " for ", dimensions(), "-tensor")); + + if(firstDimension >= dimensions()) + return false; + + return IncrementCoord( + coord.begin() + firstDimension, coord.end(), m_sizes.begin(), m_sizes.end()); + } + + inline TensorDescriptor TensorDescriptor::withNormalizedDimensions() const + { + auto dims = iota(0, dimensions()).to(); + + std::ranges::sort(dims, [this](size_t a, size_t b) { return m_strides[a] < m_strides[b]; }); + + std::vector sizes, strides; + sizes.reserve(dimensions()); + strides.reserve(dimensions()); + + for(auto dim : dims) + { + sizes.push_back(m_sizes[dim]); + strides.push_back(m_strides[dim]); + } + + return TensorDescriptor(m_dataType, std::move(sizes), std::move(strides)); + } + + template + inline void setCommandTensorArg(rocRoller::CommandArguments& commandArgs, + rocRoller::Operations::OperationTag const& tag, + TensorDescriptor& desc, + T value) + { + commandArgs.setArgument(tag, ArgumentType::Value, value); + commandArgs.setArgument(tag, ArgumentType::Limit, desc.totalLogicalElements()); + + auto const& sizes = desc.sizes(); + for(size_t i = 0; i < sizes.size(); i++) + commandArgs.setArgument(tag, ArgumentType::Size, i, sizes[i]); + + auto const& strides = desc.strides(); + for(size_t i = 0; i < strides.size(); i++) + commandArgs.setArgument(tag, ArgumentType::Stride, i, (size_t)strides[i]); + } + + template + std::string writeTensor(std::vector const& data, TensorDescriptor desc) + { + std::string rv = desc.toString() + "\n"; + + auto const& sizes = desc.sizes(); + auto count = CoordCount(sizes.begin(), std::prev(sizes.end())); + + std::vector prevCoord(desc.dimensions(), 0); + for(size_t coordNum = 0; coordNum < count; coordNum++) + { + std::vector coord(desc.dimensions(), 0); + CoordNumbered(coordNum, + coord.begin(), + std::prev(coord.end()), + sizes.begin(), + std::prev(sizes.end())); + + for(coord.back() = 0; coord.back() < sizes.back(); coord.back()++) + { + auto idx = desc.index(coord); + + if(coord.back() > 0) + rv += " "; + else + rv += fmt::format("{: >8}| ", idx); + + rv += fmt::format("{: >8}", data.at(idx)); + } + + rv += "\n"; + + bool newDim = false; + for(int idx = 0; idx < desc.dimensions(); idx++) + { + if(coord[idx] < prevCoord[idx]) + { + newDim = true; + break; + } + } + + if(newDim) + { + rv += "\n"; + bool first = true; + for(auto dim : coord) + { + if(!first) + rv += ", "; + rv += std::to_string(dim); + first = false; + } + } + + prevCoord = std::move(coord); + } + + return rv; + } + + template + inline std::vector shuffleDims(std::vector const& input, + TensorDescriptor const& dst, + TensorDescriptor const& src) + { + AssertFatal(dst.dimensions() > 1, ShowValue(dst.dimensions())); + AssertFatal(dst.sizes() == src.sizes(), ShowValue(dst.sizes()), ShowValue(src.sizes())); + AssertFatal(dst.dataType() == src.dataType()); + + auto const& sizes = dst.sizes(); + + std::vector rv(input.size()); + + auto count = CoordCount(sizes.begin(), std::prev(sizes.end())); +#pragma omp parallel for + for(size_t coordNum = 0; coordNum < count; coordNum++) + { + std::vector coord(dst.dimensions(), 0); + CoordNumbered(coordNum, + coord.begin(), + std::prev(coord.end()), + sizes.begin(), + std::prev(sizes.end())); + + for(coord.back() = 0; coord.back() < sizes.back(); coord.back()++) + { + auto dstIdx = dst.index(coord); + auto srcIdx = src.index(coord); + + rv.at(dstIdx) = input.at(srcIdx); + } + } + + return rv; + } +} diff --git a/shared/rocroller/lib/source/CMakeLists.txt b/shared/rocroller/lib/source/CMakeLists.txt index e8bce2fc109..275b7985736 100644 --- a/shared/rocroller/lib/source/CMakeLists.txt +++ b/shared/rocroller/lib/source/CMakeLists.txt @@ -38,6 +38,7 @@ target_sources(rocroller "${CMAKE_CURRENT_SOURCE_DIR}/ReplaceKernelArgs.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/ScheduledInstructions.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/Scheduler.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/TensorDescriptor.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/Timer.cpp" ) diff --git a/shared/rocroller/lib/source/CommandSolution.cpp b/shared/rocroller/lib/source/CommandSolution.cpp index 50d2eed23f0..3474aefbb16 100644 --- a/shared/rocroller/lib/source/CommandSolution.cpp +++ b/shared/rocroller/lib/source/CommandSolution.cpp @@ -318,7 +318,7 @@ namespace rocRoller if(!m_context->kernelOptions()->lazyAddArguments) m_context->kernel()->addCommandArguments(m_command->getArguments()); - auto kernelGraph = KernelGraph::translate(m_command); + auto kernelGraph = KernelGraph::translate(m_command, m_commandParameters); if(Settings::getInstance()->get(Settings::LogGraphs)) Log::debug("CommandKernel::generateKernel: post translate: {}", diff --git a/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp b/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp index e13018b66f7..8da333db67e 100644 --- a/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp +++ b/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -656,23 +657,62 @@ namespace rocRoller = std::visit(getBlockParams, *bSource); // contraction dims are {1} and {0}, which is matrix multiplication - auto TC = m_graph.control.addElement(contraction); + auto TC = m_graph.control.addElement(NOP{}); m_op[mul.getTag()] = TC; m_graph.mapper.connect(TC, D, NaryArgument::DEST); std::vector sourceDims; auto connectBlockScale = [&](Operations::BlockScale const& op, + Operations::OperationTag inputTag, NaryArgument valueArg, - NaryArgument scaleArg) { + NaryArgument scaleArg) -> std::vector { auto mode = op.scaleMode(); AssertFatal(mode != Operations::ScaleMode::Inline, ShowValue(mode)); + auto scaleInput = op.scale().value(); + auto scaleInputOp = m_command->findTag(scaleInput); + AssertFatal(scaleInputOp != nullptr); + + using TensorAndTranspose + = std::tuple>; + auto getTensorAndTranspose = rocRoller::overloaded{ + [](Operations::SubTileTranspose const& op) -> TensorAndTranspose { + return {op.input(), op.tileDimensions()}; + }, + [](Operations::Nop const& op) -> TensorAndTranspose { + return {Operations::OperationTag(), {}}; + }, + [](auto const& op) -> TensorAndTranspose { + return {op.getTag(), {}}; + }}; + + auto [scaleTensor, scaleTranspose] + = std::visit(getTensorAndTranspose, *scaleInputOp); + + if(!scaleTranspose.empty()) + { + AssertFatal(m_params); + auto info = m_params->getDimensionInfo().at(scaleTensor); + auto tile + = std::get(info); + size_t miKScale = tile.miTileSizes.at(2); + + std::vector expectedTile{64, 4, miKScale}; + + AssertFatal(scaleTranspose == expectedTile, + ShowValue(scaleTranspose), + ShowValue(expectedTile), + ShowValue(valueArg), + ShowValue(tile), + ShowValue(tile.miTileSizes)); + } + auto X = m_dim.at(op.data()); - auto XScale = m_dim.at(*op.scale()); + auto XScale = m_dim.at(scaleTensor); auto loadX = m_op.at(op.data()); - auto loadXScale = m_op.at(*op.scale()); + auto loadXScale = m_op.at(scaleTensor); m_graph.control.addElement(Sequence(), {loadX}, {TC}); m_graph.control.addElement(Sequence(), {loadXScale}, {TC}); @@ -681,49 +721,52 @@ namespace rocRoller m_graph.mapper.connect(TC, XScale, scaleArg); sourceDims.insert(sourceDims.end(), {X, XScale}); - }; - - // Handle A, either T_Load_Tiled or BlockScale - std::visit( - rocRoller::overloaded{ - [&](auto const& op, auto, auto) { Throw("Can't go here!"); }, - - [&](Operations::T_Load_Tiled const& op, NaryArgument valueArg, auto) { - // This is difficult to make common with the B version since it needs to directly access mul.a or mul.b. - auto A = m_dim.at(mul.a); - auto loadA = m_op.at(mul.a); - sourceDims.push_back(A); + return scaleTranspose; + }; - m_graph.control.addElement(Sequence(), {loadA}, {TC}); + auto handleInput = rocRoller::overloaded{ + [&](auto const& op, auto, auto, auto) -> std::vector { + Throw("Can't go here!"); + return {}; + }, - m_graph.mapper.connect(TC, A, valueArg); - }, - connectBlockScale}, - *aSource, - singleVariant(NaryArgument::LHS), - singleVariant(NaryArgument::LHS_SCALE)); + [&](Operations::T_Load_Tiled const& op, + Operations::OperationTag inputTag, + NaryArgument valueArg, + auto) -> std::vector { + auto AB = m_dim.at(inputTag); + auto loadAB = m_op.at(inputTag); - std::visit( - rocRoller::overloaded{ - [&](auto const& op, auto, auto) { Throw("Can't go here!"); }, + sourceDims.push_back(AB); - [&](Operations::T_Load_Tiled const& op, NaryArgument valueArg, auto) { - auto B = m_dim.at(mul.b); - auto loadB = m_op.at(mul.b); + m_graph.control.addElement(Sequence(), {loadAB}, {TC}); - sourceDims.push_back(B); + m_graph.mapper.connect(TC, AB, valueArg); - m_graph.control.addElement(Sequence(), {loadB}, {TC}); + return {}; + }, + connectBlockScale}; - m_graph.mapper.connect(TC, B, valueArg); - }, - connectBlockScale}, - *bSource, - singleVariant(NaryArgument::RHS), - singleVariant(NaryArgument::RHS_SCALE)); + // Handle A, either T_Load_Tiled or BlockScale + contraction.scalePreShuffledTileA + = std::visit(handleInput, + *aSource, + singleVariant(mul.a), + singleVariant(NaryArgument::LHS), + singleVariant(NaryArgument::LHS_SCALE)); + + contraction.scalePreShuffledTileB + = std::visit(handleInput, + *bSource, + singleVariant(mul.b), + singleVariant(NaryArgument::RHS), + singleVariant(NaryArgument::RHS_SCALE)); m_graph.coordinates.addElement(DataFlow(), sourceDims, std::vector{D}); + + // Replace contraction in graph after we have filled all the fields. + m_graph.control.setElement(TC, std::move(contraction)); } void operator()(Operations::BlockScale const& t) @@ -732,6 +775,8 @@ namespace rocRoller "ScaleMode::Inline not supported yet."); } + void operator()(Operations::SubTileTranspose const& t) {} + void operator()(Operations::Literal const& literal) { rocRoller::Log::getLogger()->debug("KernelGraph::TranslateVisitor::Literal"); @@ -745,9 +790,10 @@ namespace rocRoller void operator()(Operations::Tensor const& t) {} void operator()(Operations::Scalar const& t) {} - KernelGraph call(CommandPtr command) + KernelGraph call(CommandPtr command, CommandParametersPtr params) { m_command = command; + m_params = params; for(auto const& op : command->operations()) { std::visit(*this, *op); @@ -770,16 +816,17 @@ namespace rocRoller // command tag -> dimension/coordinate tag std::map m_dim; - CommandPtr m_command; + CommandPtr m_command; + CommandParametersPtr m_params; }; - KernelGraph translate(CommandPtr command) + KernelGraph translate(CommandPtr command, CommandParametersPtr params) { TIMER(t, "KernelGraph::translate"); rocRoller::Log::getLogger()->debug("KernelGraph::translate(); Command\n{}", command->toString()); TranslateVisitor visitor; - return visitor.call(command); + return visitor.call(command, params); } } } diff --git a/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp b/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp index 4ae26952015..d0a1f6d50cc 100644 --- a/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp +++ b/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -459,6 +460,30 @@ namespace rocRoller auto scaleModeA = getScaleMode(info.loadAScale); auto scaleModeB = getScaleMode(info.loadBScale); + { + auto expectedSkipValue = false; + + auto contraction = graph.control.get(tag).value(); + + if(!contraction.scalePreShuffledTileA.empty()) + { + AssertFatal( + scaleModeA == Operations::ScaleMode::Separate + && scaleModeB == Operations::ScaleMode::Separate + && !contraction.scalePreShuffledTileB.empty(), + "Pre-swizzled inputs must currently be for both A and B or neither.", + ShowValue(scaleModeA), + ShowValue(scaleModeB), + ShowValue(contraction.scalePreShuffledTileB)); + + expectedSkipValue = true; + } + + AssertFatal(context->kernelOptions()->scaleSkipPermlane == expectedSkipValue, + ShowValue(context->kernelOptions()->scaleSkipPermlane), + ShowValue(expectedSkipValue)); + } + auto accumulationCoordSize = getAccumulationLoopSize(graph, a, info.userA); auto [K, forK] = rangeFor(graph, accumulationCoordSize, rocRoller::KLOOP); diff --git a/shared/rocroller/lib/source/Operations/BlockScale.cpp b/shared/rocroller/lib/source/Operations/BlockScale.cpp index baa915f3117..6e64f3cae90 100644 --- a/shared/rocroller/lib/source/Operations/BlockScale.cpp +++ b/shared/rocroller/lib/source/Operations/BlockScale.cpp @@ -35,21 +35,26 @@ namespace rocRoller BlockScale::BlockScale(OperationTag data, int dimensions, std::optional scale, - std::vector const& strides) + std::vector strides) : BaseOperation() , m_data(data) , m_scale(scale) - , m_strides([&]() { - if(dimensions >= 1) - { - std::vector rt(dimensions, 1); - rt[0] = 32; // Default value for first stride based on hardware arch - std::copy(strides.begin(), strides.end(), rt.begin()); - return rt; - } - return std::vector{}; - }()) { + if(!strides.empty()) + { + m_strides = std::move(strides); + } + + if(m_strides.empty()) + { + // Default value for first stride based on hardware arch + m_strides.push_back(32); + } + + if(m_strides.size() != dimensions) + { + m_strides.resize(dimensions, 1); + } } std::unordered_set BlockScale::getInputs() const @@ -125,5 +130,42 @@ namespace rocRoller { return stream << toString(mode); } + + SubTileTranspose::SubTileTranspose(OperationTag input, std::vector tileDimensions) + : m_input(input) + , m_tileDimensions(std::move(tileDimensions)) + { + if(!m_tileDimensions.empty()) + { + AssertFatal(m_tileDimensions.size() == 3, ShowValue(m_tileDimensions)); + AssertFatal(m_tileDimensions[0] == 64 && m_tileDimensions[1] == 4 + && (m_tileDimensions[2] == 2 || m_tileDimensions[2] == 4), + ShowValue(m_tileDimensions)); + } + } + + std::unordered_set SubTileTranspose::getInputs() const + { + return {m_input}; + } + std::string SubTileTranspose::toString() const + { + return fmt::format( + "SubTileTranspose({}: input {})", concatenate(m_tileDimensions), m_input.value); + } + const std::vector& SubTileTranspose::tileDimensions() const + { + return m_tileDimensions; + } + + bool SubTileTranspose::operator==(SubTileTranspose const& other) const + { + return (*this <=> other) == std::strong_ordering::equal; + } + + OperationTag SubTileTranspose::input() const + { + return m_input; + } } } diff --git a/shared/rocroller/lib/source/Operations/Operation.cpp b/shared/rocroller/lib/source/Operations/Operation.cpp index f1343a7517b..e83c0badce0 100644 --- a/shared/rocroller/lib/source/Operations/Operation.cpp +++ b/shared/rocroller/lib/source/Operations/Operation.cpp @@ -54,5 +54,10 @@ namespace rocRoller m_tag = tag; } + std::strong_ordering BaseOperation::operator<=>(BaseOperation const& other) const + { + return m_tag <=> other.m_tag; + } + } } diff --git a/shared/rocroller/lib/source/TensorDescriptor.cpp b/shared/rocroller/lib/source/TensorDescriptor.cpp new file mode 100644 index 00000000000..42e7cf12815 --- /dev/null +++ b/shared/rocroller/lib/source/TensorDescriptor.cpp @@ -0,0 +1,76 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#include + +namespace rocRoller +{ + int64_t TensorDescriptor::dimensionPadding(size_t dim) const + { + AssertFatal(dim < dimensions(), ShowValue(dim), ShowValue(dimensions())); + + if(dim == 0) + return m_strides[0] - 1; + + return m_strides[dim] - (m_strides[dim - 1] * m_sizes[dim - 1]); + } + + void TensorDescriptor::collapseDims(size_t begin, size_t end) + { + AssertFatal(end >= begin, ShowValue(begin), ShowValue(end)); + AssertFatal(begin < dimensions(), ShowValue(begin), ShowValue(dimensions())); + AssertFatal(end <= dimensions(), ShowValue(begin), ShowValue(dimensions())); + + if(end <= (begin + 1)) + return; + + for(size_t i = begin + 1; i < end; i++) + AssertFatal(dimensionPadding(i) == 0, ShowValue(i), ShowValue(dimensionPadding(i))); + + size_t newDimensionSize = 1; + for(size_t i = begin; i < end; i++) + newDimensionSize *= m_sizes[i]; + + m_sizes.erase(m_sizes.begin() + (begin + 1), m_sizes.begin() + end); + m_sizes[begin] = newDimensionSize; + + m_strides.erase(m_strides.begin() + (begin + 1), m_strides.begin() + end); + + calculate(); + } + + std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t) + { + return stream << t.toString(); + } + +} \ No newline at end of file diff --git a/shared/rocroller/scripts/lib/rrperf/rrsuites.py b/shared/rocroller/scripts/lib/rrperf/rrsuites.py index 9201e224085..10712d4d615 100644 --- a/shared/rocroller/scripts/lib/rrperf/rrsuites.py +++ b/shared/rocroller/scripts/lib/rrperf/rrsuites.py @@ -1334,9 +1334,16 @@ def fp4_16x16x128_scale_options(): yield from addSkipPermlane(fp4_target_d2lds_mi16x16x128_pf4x1_wgm()) +def fp4_32x32x64_scale_options(): + yield from fp4_target_d2lds_mi16x16x128_pf4x1_wgm() + yield from addSkipPermlane(fp4_target_d2lds_mi32x32x64_pf4x1_wgm()) + + def fp4_kernels(): yield from fp4_kernels_no_wgm() yield from fp4_kernels_wgm() + yield from fp4_16x16x128_scale_options() + yield from fp4_32x32x64_scale_options() def fp4_target_sweep_wgms(): diff --git a/shared/rocroller/test/catch/CMakeLists.txt b/shared/rocroller/test/catch/CMakeLists.txt index cdcd45529d0..8b4d1dd262c 100644 --- a/shared/rocroller/test/catch/CMakeLists.txt +++ b/shared/rocroller/test/catch/CMakeLists.txt @@ -74,6 +74,7 @@ target_sources(rocroller-tests-catch "${CMAKE_CURRENT_SOURCE_DIR}/ScaleUtilsTest.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/SettingsTest.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/SubDwordExpressionTest.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/TensorDescriptorTest.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/TernaryExpressionTest.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/TestContext.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/TestContextTest.cpp" diff --git a/shared/rocroller/test/catch/TensorDescriptorTest.cpp b/shared/rocroller/test/catch/TensorDescriptorTest.cpp new file mode 100644 index 00000000000..0ce5faa77ff --- /dev/null +++ b/shared/rocroller/test/catch/TensorDescriptorTest.cpp @@ -0,0 +1,333 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace rocRoller; + +TEST_CASE("TensorDescriptor basic functionality", "[utils][tensor-descriptor]") +{ + TensorDescriptor t(DataType::Float, {11, 13, 17}); + + CHECK(t.dimensions() == 3); + CHECK(t.sizes() == std::vector({11, 13, 17})); + CHECK(t.strides() == std::vector({1, 11, 11 * 13})); + + CHECK(t.totalLogicalElements() == 11 * 13 * 17); + CHECK(t.totalAllocatedElements() == 11 * 13 * 17); + CHECK(t.totalAllocatedBytes() == 11 * 13 * 17 * 4); + + for(int i = 0; i < 3; i++) + CHECK(t.dimensionPadding(i) == 0); + + CHECK(t.index(3, 4, 1) == 3 + 4 * 11 + 11 * 13); +} + +TEST_CASE("TensorDescriptor works with overlapping strides", "[utils][tensor-descriptor]") +{ + TensorDescriptor t(DataType::Float, {4, 6, 3}, {1, 4, 1}); + + CHECK(t.dimensions() == 3); + CHECK(t.sizes() == std::vector({4, 6, 3})); + CHECK(t.strides() == std::vector({1, 4, 1})); + + CHECK(t.dimensionPadding(0) == 0); + CHECK(t.dimensionPadding(1) == 0); + + CHECK(t.totalLogicalElements() == 4 * 6 * 3); + CHECK(t.totalAllocatedElements() == 4 * 6 + (3 - 1)); + CHECK(t.totalAllocatedBytes() == (4 * 6 + (3 - 1)) * 4); +} + +TEST_CASE("TensorDescriptor works with padding", "[utils][tensor-descriptor]") +{ + TensorDescriptor t(DataType::Float, {11, 13, 17, 4}, {1, 16, 16 * 13, 16 * 13 * 17}); + + CHECK(t.dimensions() == 4); + CHECK(t.sizes() == std::vector({11, 13, 17, 4})); + CHECK(t.strides() == std::vector({1, 16, 16 * 13, 16 * 13 * 17})); + + CHECK(t.totalLogicalElements() == 11 * 13 * 17 * 4); + CHECK(t.totalAllocatedElements() + == 1 + 1 * (11 - 1) + 16 * (13 - 1) + (16 * 13) * (17 - 1) + (16 * 13 * 17) * (4 - 1)); + + CHECK(t.dimensionPadding(0) == 0); + CHECK(t.dimensionPadding(1) == 5); + CHECK(t.dimensionPadding(2) == 0); + CHECK(t.dimensionPadding(3) == 0); + + CHECK(t.index(3, 4, 1, 2) == 3 + 4 * 16 + 16 * 13 + 16 * 13 * 17 * 2); +} + +TEST_CASE("TensorDescriptor works with simplified padding", "[utils][tensor-descriptor]") +{ + TensorDescriptor t(DataType::Float, {4, 5}, {1, 8}); + + CHECK(t.dimensions() == 2); + CHECK(t.sizes() == std::vector({4, 5})); + CHECK(t.strides() == std::vector({1, 8})); // default 1,4 + + CHECK(t.dimensionPadding(0) == 0); + CHECK(t.dimensionPadding(1) == 4); + + CHECK(t.totalLogicalElements() == 4 * 5); + CHECK(t.totalAllocatedElements() == 4 + 8 * (5 - 1)); + CHECK(t.totalAllocatedBytes() == (4 + 8 * (5 - 1)) * 4); +} + +TEST_CASE("TensorDescriptor works with zero strides", "[utils][tensor-descriptor]") +{ + TensorDescriptor t(DataType::Float, + {4, 5, 6}, + {TensorDescriptor::UseDefaultStride, TensorDescriptor::UseDefaultStride, 0}); + + CHECK(t.dimensions() == 3); + CHECK(t.sizes() == std::vector({4, 5, 6})); + CHECK(t.strides() == std::vector({1, 4, 0})); // default 1,4 + + CHECK(t.dimensionPadding(0) == 0); + CHECK(t.dimensionPadding(1) == 0); + CHECK(t.dimensionPadding(2) == -20); + + CHECK(t.totalLogicalElements() == 4 * 5 * 6); + CHECK(t.totalAllocatedElements() == 4 * 5); + CHECK(t.totalAllocatedBytes() == 4 * 5 * 4); +} + +TEST_CASE("TensorDescriptor::CollapseDims works", "[utils][tensor-descriptor]") +{ + TensorDescriptor t(DataType::Float, {11, 13, 17, 4}, {1, 16, 16 * 13, 16 * 13 * 17}); + + TensorDescriptor u = t; + CHECK_THROWS_AS(u.collapseDims(0, 2), std::runtime_error); + + u.collapseDims(1, 3); + + CHECK(u.dimensions() == 3); + CHECK(u.sizes() == std::vector({11, 13 * 17, 4})); + CHECK(u.strides() == std::vector({1, 16, 16 * 13 * 17})); + + CHECK(u.totalLogicalElements() == t.totalLogicalElements()); + CHECK(u.totalAllocatedElements() == t.totalAllocatedElements()); + CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes()); +} + +TEST_CASE("TensorDescriptor::CollapseDims works part 2", "[utils][tensor-descriptor]") +{ + TensorDescriptor t(DataType::Float, {11, 13, 17, 4}); + + SECTION("0,2") + { + TensorDescriptor u = t; + u.collapseDims(0, 2); + + CHECK(u.dimensions() == 3); + CHECK(u.sizes() == std::vector({11 * 13, 17, 4})); + CHECK(u.strides() == std::vector({1, 11 * 13, 11 * 13 * 17})); + + CHECK(u.totalLogicalElements() == t.totalLogicalElements()); + CHECK(u.totalAllocatedElements() == t.totalAllocatedElements()); + CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes()); + } + + SECTION("0,4") + { + TensorDescriptor u = t; + u.collapseDims(0, 4); + + CHECK(u.dimensions() == 1); + CHECK(u.sizes() == std::vector({11 * 13 * 17 * 4})); + CHECK(u.strides() == std::vector({1})); + + CHECK(u.totalLogicalElements() == t.totalLogicalElements()); + CHECK(u.totalAllocatedElements() == t.totalAllocatedElements()); + CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes()); + } + + SECTION("1,4") + { + TensorDescriptor u = t; + u.collapseDims(1, 4); + + CHECK(u.dimensions() == 2); + CHECK(u.sizes() == std::vector({11, 13 * 17 * 4})); + CHECK(u.strides() == std::vector({1, 11})); + + CHECK(u.totalLogicalElements() == t.totalLogicalElements()); + CHECK(u.totalAllocatedElements() == t.totalAllocatedElements()); + CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes()); + } + + SECTION("1,3") + { + TensorDescriptor u = t; + u.collapseDims(1, 3); + + CHECK(u.dimensions() == 3); + CHECK(u.sizes() == std::vector({11, 13 * 17, 4})); + CHECK(u.strides() == std::vector({1, 11, 11 * 13 * 17})); + + CHECK(u.totalLogicalElements() == t.totalLogicalElements()); + CHECK(u.totalAllocatedElements() == t.totalAllocatedElements()); + CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes()); + } +} + +TEST_CASE("IncrementCoord works for 2 dimensions", "[utils][tensor-descriptor]") +{ + std::vector dims{2, 4}; + std::vector lastCoord{1, 3}; + std::vector coordRef(2); + std::vector coordRun(2); + + for(coordRef[1] = 0; coordRef[1] < dims[1]; coordRef[1]++) + for(coordRef[0] = 0; coordRef[0] < dims[0]; coordRef[0]++) + { + CHECK(coordRun == coordRef); + + bool continueIteration + = IncrementCoord(coordRun.begin(), coordRun.end(), dims.begin(), dims.end()); + if(coordRef == lastCoord) + CHECK(continueIteration == false); + else + CHECK(continueIteration == true); + } + + coordRef = {0, 0}; + CHECK(coordRun == coordRef); + + CHECK(IncrementCoord(coordRun.begin(), coordRun.end(), dims.begin(), dims.end())); +} + +TEST_CASE("Default strides work for TensorDescriptor.", "[utils][tensor-descriptor]") +{ + TensorDescriptor desc(DataType::Float, {4, 5, 6}, {static_cast(-1), 5}); + CHECK(desc.dimensions() == 3); + CHECK(desc.sizes() == std::vector({4, 5, 6})); + CHECK(desc.strides() == std::vector({1, 5, 25})); +} + +TEST_CASE("Specifying a subset of strides works for TensorDescriptor.", + "[utils][tensor-descriptor]") +{ + TensorDescriptor desc(DataType::Float, {4, 5, 6}, {5}); + CHECK(desc.dimensions() == 3); + CHECK(desc.sizes() == std::vector({4, 5, 6})); + CHECK(desc.strides() == std::vector({5, 20, 100})); +} + +TEST_CASE("ShuffleDims works in the no-op case", "[utils][tensor-descriptor]") +{ + auto dims = GENERATE(Catch::Generators::range(2, 10)); + DYNAMIC_SECTION(fmt::format("dims={}", dims)) + { + auto seed = GENERATE(Catch::Generators::take( + 4, Catch::Generators::random(0, std::numeric_limits::max()))); + + DYNAMIC_SECTION(fmt::format("seed={}", seed)) + { + auto sizes = RandomGenerator(seed).vector(dims, 1, 12); + CAPTURE(sizes); + + TensorDescriptor desc(DataType::Int32, sizes); + + auto input = iota(0, desc.totalAllocatedElements()).to(); + auto output = shuffleDims(input, desc, desc); + + CHECK(input == output); + } + } +} + +TEST_CASE("ShuffleDims is reversible", "[utils][tensor-descriptor]") +{ + auto dims = GENERATE(Catch::Generators::range(2, 10)); + DYNAMIC_SECTION(fmt::format("dims={}", dims)) + { + auto seed = GENERATE(Catch::Generators::take( + 4, Catch::Generators::random(0, std::numeric_limits::max()))); + + DYNAMIC_SECTION(fmt::format("seed={}", seed)) + { + auto gen = RandomGenerator(seed); + auto sizes = gen.vector(dims, 2, 12); + CAPTURE(sizes); + + TensorDescriptor src(DataType::Int32, sizes); + + std::vector order; + { + auto indices = iota(0, dims).to(); + while(!indices.empty()) + { + auto idx = gen.next(0, indices.size() - 1); + REQUIRE(idx < dims); + order.push_back(indices.at(idx)); + indices.erase(next(indices.begin(), idx)); + } + } + + auto dst = TensorDescriptor::ShuffledNoPadding(DataType::Int32, sizes, order); + + auto numNonUnitSizes = std::ranges::count_if(sizes, [](auto x) { return x > 1; }); + + CAPTURE(src.strides()); + CAPTURE(dst.strides()); + + auto input = iota(0, src.totalAllocatedElements()).to(); + + SECTION("a -> b -> a") + { + auto intermediate = shuffleDims(input, dst, src); + CAPTURE(intermediate); + if(numNonUnitSizes > 1 && src.strides() != dst.strides()) + CHECK(input != intermediate); + auto output = shuffleDims(intermediate, src, dst); + REQUIRE(input == output); + } + + SECTION("b -> a -> b") + { + auto intermediate = shuffleDims(input, src, dst); + CAPTURE(intermediate); + if(numNonUnitSizes > 1 && src.strides() != dst.strides()) + CHECK(input != intermediate); + auto output = shuffleDims(intermediate, dst, src); + REQUIRE(input == output); + } + } + } +} \ No newline at end of file diff --git a/shared/rocroller/test/common/include/common/mxDataGen.hpp b/shared/rocroller/test/common/include/common/mxDataGen.hpp index 9fd5090b1a8..51fcc1127a4 100644 --- a/shared/rocroller/test/common/include/common/mxDataGen.hpp +++ b/shared/rocroller/test/common/include/common/mxDataGen.hpp @@ -94,12 +94,12 @@ namespace rocRoller template DGen::DataGenerator::type> - getDataGenerator(TensorDescriptor& desc, - const float min, - const float max, - const uint32_t seed, - const index_t blockScaling = 1, - const DataPattern pattern = Bounded) + getDataGenerator(TensorDescriptor const& desc, + const float min, + const float max, + const uint32_t seed, + const index_t blockScaling = 1, + const DataPattern pattern = Bounded) { auto sizes = desc.sizes(); auto strides = desc.strides(); @@ -175,20 +175,20 @@ namespace rocRoller } template - void DGenInput(const uint32_t seed, - std::vector& hostA, - TensorDescriptor& descA, - std::vector& hostB, - TensorDescriptor& descB, - std::vector& hostC, - TensorDescriptor& descC, - std::vector& hostScaleA, - std::vector& hostScaleB, - bool hasScaleA = false, - bool hasScaleB = false, - float min = -1.f, - float max = 1.f, - const uint scaleBlockSize = 32 + void DGenInput(const uint32_t seed, + std::vector& hostA, + TensorDescriptor const& descA, + std::vector& hostB, + TensorDescriptor const& descB, + std::vector& hostC, + TensorDescriptor const& descC, + std::vector& hostScaleA, + std::vector& hostScaleB, + bool hasScaleA = false, + bool hasScaleB = false, + float min = -1.f, + float max = 1.f, + const uint scaleBlockSize = 32 ) { diff --git a/shared/rocroller/test/unit/PermLanesTest.cpp b/shared/rocroller/test/unit/PermLanesTest.cpp index f34c1c6a437..8e47ae145c5 100644 --- a/shared/rocroller/test/unit/PermLanesTest.cpp +++ b/shared/rocroller/test/unit/PermLanesTest.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include "GPUContextFixture.hpp" @@ -160,25 +161,50 @@ namespace PermLanesTest result.data(), dResult.get(), result.size() * sizeof(uint8_t), hipMemcpyDefault), HasHipSuccess(0)); - auto nWaves = 4; - auto factor = waveK / miK; - auto nLanes = 16; - for(int wave = 0; wave < nWaves; wave++) - for(int simdBlock = 0; simdBlock < miK; simdBlock++) - for(int simdIndex = 0; simdIndex < factor; simdIndex++) - for(int lane = 0; lane < nLanes; lane++) - for(int vgprBlock = 0; vgprBlock < factor; vgprBlock++) - for(int vgprIndex = 0; vgprIndex < miK; vgprIndex++) - { - ASSERT_EQ(a[wave * waveK * nLanes * waveK - + simdBlock * factor * nLanes * waveK - + simdIndex * nLanes * waveK + lane * waveK - + vgprBlock * miK + vgprIndex], - result[wave * waveK * nLanes * waveK - + vgprIndex * factor * nLanes * waveK - + simdIndex * nLanes * waveK + lane * waveK - + vgprBlock * miK + simdBlock]); - } + int nWaves = 4; + int factor = waveK / miK; + int nLanes = 16; + + // clang-format off + for(int wave = 0; wave < nWaves; wave++) + for(int simdBlock = 0; simdBlock < miK; simdBlock++) + for(int simdIndex = 0; simdIndex < factor; simdIndex++) + for(int lane = 0; lane < nLanes; lane++) + for(int vgprBlock = 0; vgprBlock < factor; vgprBlock++) + for(int vgprIndex = 0; vgprIndex < miK; vgprIndex++) + { + auto aIdx = wave * waveK * nLanes * waveK + + simdBlock * factor * nLanes * waveK + + simdIndex * nLanes * waveK + + lane * waveK + + vgprBlock * miK + + vgprIndex; + + auto resultIdx = wave * waveK * nLanes * waveK + + vgprIndex * factor * nLanes * waveK + + simdIndex * nLanes * waveK + + lane * waveK + + vgprBlock * miK + + simdBlock; + + ASSERT_EQ(a[aIdx], result[resultIdx]); + } + // clang-format on + + std::vector sizes = {static_cast(miK), + static_cast(factor), + static_cast(nLanes), + static_cast(factor), + static_cast(miK), + static_cast(nWaves)}; + + auto order = {4, 1, 2, 3, 0, 5}; + + TensorDescriptor src(DataType::E8M0, sizes); + auto dst = TensorDescriptor::ShuffledNoPadding(DataType::E8M0, sizes, {4, 1, 2, 3, 0, 5}); + + auto a_reordered = shuffleDims(a, dst, src); + EXPECT_EQ(a_reordered, result); } TEST_F(PermLanesTest, PermLanesBlockScale16x4GPUTest)