ROCm · bnemanich · Dec 9, 2025 · Nov 17, 2025 · Nov 18, 2025 · Nov 19, 2025
@@ -33,7 +33,7 @@
 
 #include <rocRoller/DataTypes/DataTypes.hpp>
 
-#include <origami/utils.hpp>
+#include "origami/types.hpp"
 
 /**
  * @brief Convert rocRoller::Datatype to analytical::DataType

@@ -28,55 +28,70 @@
 #include "gemm.hpp"
 #include "runtime_args_selection.hpp"
 
-#include <origami/streamk.hpp>
-
-const int DEFAULT_DYNAMIC_MODE = 6;
+#include "origami/streamk.hpp"
 
+/**
+ * @brief Choose the Stream-K grid size for a GEMM kernel via origami.
+ *
+ * Describes the problem and the kernel's workgroup tile to origami, asks it for a
+ * reduction strategy, records that strategy in the config, and then asks it for
+ * the grid size.
+ *
+ * @param gemm Kernel whose workgroup tile, occupancy and data types are used.
+ * @param prob Problem supplying sizes, batch count, transposes and workspace size.
+ * @return The value returned by origami::streamk::select_grid_size.
+ */
 int chooseStreamKGridSize(std::shared_ptr<GemmKernel>        gemm,
                           const RocblasltContractionProblem& prob)
 {
-    const origami::hardware_t analaytical_hardware = origami::hardware_t::get_hardware_for_device(0);
+    // NOTE: the hardware description is always queried for device 0.
+    const origami::hardware_t analytical_hardware = origami::hardware_t::get_hardware_for_device(0);
+
+    const origami::grid_selection_t DEFAULT_DYNAMIC_MODE = origami::grid_selection_t::k_split_aware;
+
+    // Compute-unit count reported for the device; handed to select_grid_size() as max_cus.
+    size_t max_cus = analytical_hardware.N_CU;
 
     size_t elementSizeA_bits = rocRoller::DataTypeInfo::Get(gemm->params->kernelType.typeA).elementBits;
     size_t elementSizeB_bits = rocRoller::DataTypeInfo::Get(gemm->params->kernelType.typeB).elementBits;
-    size_t elementSizeD_bits = rocRoller::DataTypeInfo::Get(gemm->params->kernelType.typeD).elementBits;
     size_t elementSizeAcc = rocRoller::DataTypeInfo::Get(gemm->params->kernelType.typeAcc).elementBytes;
 
-    origami::data_type_t dataType;
-    if (elementSizeA_bits < elementSizeB_bits)
-        dataType = rocroller_type_to_analytical_type(gemm->params->kernelType.typeB);
-    else
-        dataType = rocroller_type_to_analytical_type(gemm->params->kernelType.typeA);
+    origami::problem_t origami_problem = {
+        .size = {prob.m, prob.n, prob.k},
+        .batch = prob.batch_count,
+        // Forward the transposes, as the replaced select_grid() call did; without these
+        // designators the fields are left at their defaults.
+        .a_transpose = (prob.trans_a == HIPBLAS_OP_T) ? origami::transpose_t::T : origami::transpose_t::N,
+        .b_transpose = (prob.trans_b == HIPBLAS_OP_T) ? origami::transpose_t::T : origami::transpose_t::N,
+        .a_dtype = rocroller_type_to_analytical_type(gemm->params->kernelType.typeA),
+        .b_dtype = rocroller_type_to_analytical_type(gemm->params->kernelType.typeB),
+        // The MI data type follows the wider of the A and B element types (A on a tie).
+        .mi_dtype = rocroller_type_to_analytical_type(elementSizeA_bits < elementSizeB_bits ? gemm->params->kernelType.typeB : gemm->params->kernelType.typeA),
+    };
+    origami::config_t origami_config = {
+        .mt = {
+            static_cast<size_t>(gemm->params->workgroupTile.m),
+            static_cast<size_t>(gemm->params->workgroupTile.n),
+            static_cast<size_t>(gemm->params->workgroupTile.k)
+        },
+        .occupancy = gemm->occupancy,
+        .workspace_size = prob.workspaceSize,
+        .workspace_size_per_elem_c = elementSizeAcc,
+    };
+
+    auto reduction_type = origami::streamk::select_reduction(origami_problem,
+                                                            analytical_hardware,
+                                                            origami_config,
+                                                            DEFAULT_DYNAMIC_MODE);
 
-    auto reduction_type = origami::streamk::select_reduction(prob.m, prob.n, prob.k, prob.batch_count,
-        gemm->params->workgroupTile.m, gemm->params->workgroupTile.n, gemm->params->workgroupTile.k, analaytical_hardware, DEFAULT_DYNAMIC_MODE);
-    // Override reduction type to tree reduction for now.
-    // When Parallel reduction is available, this line can be removed
-    reduction_type = origami::streamk::reduction_type::Tree;
+    // Record the chosen strategy in the config passed to select_grid_size().
+    origami_config.reduction_strategy = reduction_type;
 
-    auto result = origami::streamk::select_grid(prob.m,
-        prob.n,
-        prob.k,
-        prob.batch_count,
-        prob.trans_a == HIPBLAS_OP_T,
-        prob.trans_b == HIPBLAS_OP_T,
-        elementSizeA_bits,
-        elementSizeB_bits,
-        elementSizeD_bits,
-        dataType,
-        prob.workspaceSize,
-        gemm->params->workgroupTile.m,
-        gemm->params->workgroupTile.n,
-        gemm->params->workgroupTile.k,
-        gemm->params->machineInstruction.m,
-        gemm->params->machineInstruction.n,
-        gemm->params->machineInstruction.k,
-        DEFAULT_WGM,
-        elementSizeAcc,
-        gemm->occupancy,
-        analaytical_hardware,
-        DEFAULT_DYNAMIC_MODE,
-        reduction_type);
+    auto result = origami::streamk::select_grid_size(origami_problem,
+                                                    analytical_hardware,
+                                                    origami_config,
+                                                    DEFAULT_DYNAMIC_MODE,
+                                                    max_cus);
 
     return result;
 }
@@ -29,7 +29,7 @@
 #include "runtime_args_selection.hpp"
 #include "solution_selection.hpp"
 
-#include <origami/utils.hpp>
+#include "origami/origami.hpp"
 
 const int MAX_BITS_WORKGROUPTILE_M     = 8;
 const int MAX_BITS_WORKGROUPTILE_N     = 8;
@@ -44,7 +44,9 @@ const int USE_WORKGROUP_MAPPING_K_SIZE = 4096;
  * compile-time known.
  */
 
- constexpr std::array<WorkGroupTileSize, 34> possibleTileSizes = {{
+ constexpr size_t possibleTileSizesCount = 34;
+
+ constexpr std::array<WorkGroupTileSize, possibleTileSizesCount> possibleTileSizes = {{
     {256, 256, 128},
     {256, 192, 128},
     {256, 128, 128},
@@ -82,10 +84,10 @@ const int USE_WORKGROUP_MAPPING_K_SIZE = 4096;
 }};
 
 template <rocRoller::DataType typeA, rocRoller::DataType typeB>
-constexpr auto generateTileList() {
-    std::array<origami::tile_tuple, possibleTileSizes.size()> tileList{};
+auto generateTileList() {
+    std::array<origami::config_t, possibleTileSizesCount> tileList{};
 
-    for (size_t i = 0; i < possibleTileSizes.size(); ++i) {
+    for (size_t i = 0; i < possibleTileSizesCount; ++i) {
         const auto& wgt = possibleTileSizes[i];
         auto MI = pickMI(typeA, typeB, wgt);
 
@@ -96,27 +98,33 @@ constexpr auto generateTileList() {
 
         int unroll = preferredUnrolling(typeA, typeB, wgt);
 
-        int non_temporal_a = 0;
-        int non_temporal_b = 0;
-
-        tileList[i] = std::make_tuple(
-            wgt.m, wgt.n, wgtk * unroll,
-            MI.m, MI.n, MI.k,
-            1, // occupancy
-            DEFAULT_WGM,
-            non_temporal_a,
-            non_temporal_b
-        );
+        origami::config_t origami_config = {
+            .mt = {
+                static_cast<size_t>(wgt.m), 
+                static_cast<size_t>(wgt.n), 
+                static_cast<size_t>(wgtk * unroll)
+            },
+            .mi = {
+                static_cast<size_t>(MI.m),
+                static_cast<size_t>(MI.n),
+                static_cast<size_t>(MI.k)
+            },
+            .occupancy = 1,
+            .cache_hints_a = 0,
+            .cache_hints_b = 0,
+        };
+
+        tileList[i] = origami_config;
     }
 
     return tileList;
 }
 
-using TileListGeneratorFn = std::vector<origami::tile_tuple>(*)();
+using TileListGeneratorFn = std::vector<origami::config_t>(*)();
 
 template <rocRoller::DataType A, rocRoller::DataType B>
-std::vector<origami::tile_tuple> generateTileListWrapper() {
-    constexpr auto arr = generateTileList<A, B>();
+std::vector<origami::config_t> generateTileListWrapper() {
+    auto arr = generateTileList<A, B>(); // std::array built at run time (generateTileList is no longer constexpr); copied into the vector below
     return {arr.begin(), arr.end()};
 }
 
@@ -144,7 +152,7 @@ const std::map<std::pair<rocRoller::DataType, rocRoller::DataType>, TileListGene
     INSTANTIATE_TILE_LIST_FOR(FP6)
 };
 
-std::vector<origami::tile_tuple> getTileListForKernelType(KernelType kernelType)
+std::vector<origami::config_t> getTileListForKernelType(KernelType kernelType)
 {
     auto key = std::make_pair(kernelType.typeA, kernelType.typeB);
     auto it = tileListGenerators.find(key);
@@ -170,43 +178,42 @@ std::vector<SolutionIndexParameters> chooseSolutionIndexParameters(
 {
     std::vector<SolutionIndexParameters> params;
 
-    std::vector<origami::tile_tuple> tile_list = getTileListForKernelType(kernelType);
+    std::vector<origami::config_t> origami_config_list = getTileListForKernelType(kernelType);
 
     size_t elementSizeA_bits = rocRoller::DataTypeInfo::Get(kernelType.typeA).elementBits;
     size_t elementSizeB_bits = rocRoller::DataTypeInfo::Get(kernelType.typeB).elementBits;
-    size_t elementSizeC_bits = rocRoller::DataTypeInfo::Get(kernelType.typeC).elementBits;
-
-    origami::data_type_t dataType;
-    if (elementSizeA_bits < elementSizeB_bits)
-        dataType = rocroller_type_to_analytical_type(kernelType.typeB);
-    else
-        dataType = rocroller_type_to_analytical_type(kernelType.typeA);
-
-    const origami::hardware_t analaytical_hardware = origami::hardware_t::get_hardware_for_device(0);
-
-    int WGM = std::sqrt(std::floor(analaytical_hardware.N_CU / analaytical_hardware.NUM_XCD));
-
-    auto selected_tiles = origami::select_best_macro_tile_size(
-        prob.m,
-        prob.n,
-        prob.k,
-        prob.batch_count,
-        prob.trans_a == hipblasOperation_t::HIPBLAS_OP_T,
-        prob.trans_b == hipblasOperation_t::HIPBLAS_OP_T,
-        analaytical_hardware,
-        tile_list,
-        elementSizeA_bits,
-        elementSizeB_bits,
-        elementSizeC_bits,
-        dataType,
-        kernelType.scaleABlockRowSize * kernelType.scaleABlockColSize, //Handle A vs B block size.
-        0.8,
-        false,
-        WGM);
-
-    for(auto const& selected_tile : selected_tiles)
+
+    const origami::hardware_t analytical_hardware = origami::hardware_t::get_hardware_for_device(0);
+
+    origami::problem_t origami_problem = {
+        .size = {prob.m, prob.n, prob.k},
+        .batch = prob.batch_count,
+        .a_transpose = (prob.trans_a == hipblasOperation_t::HIPBLAS_OP_T) ? origami::transpose_t::T : origami::transpose_t::N,
+        .b_transpose = (prob.trans_b == hipblasOperation_t::HIPBLAS_OP_T) ? origami::transpose_t::T : origami::transpose_t::N,
+        .a_dtype = rocroller_type_to_analytical_type(kernelType.typeA),
+        .b_dtype = rocroller_type_to_analytical_type(kernelType.typeB),
+        .mi_dtype = rocroller_type_to_analytical_type(elementSizeA_bits < elementSizeB_bits ? kernelType.typeB : kernelType.typeA),
+        .a_mx_block_size = kernelType.scaleABlockRowSize * kernelType.scaleABlockColSize,
+        .b_mx_block_size = kernelType.scaleBBlockRowSize * kernelType.scaleBBlockColSize,
+    };
+
+    int defaultWGM = std::ceil(std::sqrt(analytical_hardware.N_CU / analytical_hardware.NUM_XCD));
+    for (auto& config : origami_config_list) {
+        config.workgroup_mapping = defaultWGM;
+    }
+
+    auto prediction_result = origami::rank_configs(
+        origami_problem,
+        analytical_hardware,
+        origami_config_list
+    );
+
+    for(auto const& result : prediction_result)
     {
-        WorkGroupTileSize wgt{(int)std::get<1>(selected_tile), (int)std::get<2>(selected_tile), (int)std::get<3>(selected_tile)};
+        auto mt_m = static_cast<int>(result.config.mt.m);
+        auto mt_n = static_cast<int>(result.config.mt.n);
+        auto mt_k = static_cast<int>(result.config.mt.k);
+        WorkGroupTileSize wgt{mt_m, mt_n, mt_k};
         int unrollAmount = preferredUnrolling(kernelType.typeA, kernelType.typeB, wgt);
         wgt.k /= unrollAmount;
 

@@ -42,7 +42,8 @@
 #include <Tensile/Task.hpp>
 #include <Tensile/Utils.hpp>
 
-#include <origami/streamk.hpp>
+#include "origami/origami.hpp"
+#include "origami/streamk.hpp"
 
 #define TENSILE_COMMON_KERNEL_ARGS_SIZE 16
 
@@ -166,7 +167,7 @@ namespace TensileLite
 
     struct StreamKSettings
     {
-        origami::streamk::reduction_type reduction = origami::streamk::reduction_type::Tree;
+        origami::reduction_t reduction = origami::reduction_t::tree; //!< Reduction strategy; defaults to tree.
         size_t grid = 0;
     };
 
@@ -183,7 +184,7 @@ namespace TensileLite
         using Problem       = ContractionProblemGemm;
         using Inputs        = ContractionInputs;
         using GroupedInputs = ContractionGroupedInputs;
-        using ParamsCache  = CacheMap<std::pair<int32_t, uint32_t>, Problem>;
+        using ParamsCache  = CacheMap<std::pair<int32_t, int32_t>, Problem>;
 
         /**
          * Indicate a solution is equally or estimatedly matched.
@@ -218,6 +219,11 @@ namespace TensileLite
         }
         virtual bool isFallbackForHW(Hardware const&) const;
 
+        bool isStreamK() const
+        {
+            return sizeMapping.streamK > 0; // true for any positive sizeMapping.streamK setting
+        }
+
         //! Estimates based on problem size, solution tile, and  machine hardware
         //! charz:
         struct StaticPerformanceModel
@@ -290,8 +296,8 @@ namespace TensileLite
         void calculateGrid(dim3& workGroupSize,
                            dim3& numWorkGroups,
                            ContractionSolution::Problem const& problem) const;
-        origami::streamk::reduction_type getSKReduction(Problem const& problem, Hardware const& hardware) const;
-        size_t getSKGrid(Problem const& problem, Hardware const& hardware, size_t tiles, origami::streamk::reduction_type& reductionStrat) const;
+        origami::reduction_t getSKReduction(Problem const& problem, Hardware const& hardware) const;
+        size_t getSKGrid(Problem const& problem, Hardware const& hardware, size_t tiles, origami::reduction_t reductionStrat) const;
         size_t partialTileSize(size_t skGrid) const;
 
         static float computeGranularity(float x);
@@ -566,9 +572,9 @@ namespace TensileLite
         uint32_t magicNumber(int magicDivAlg, uint32_t x, uint32_t* magicShift) const;
         uint32_t smallMagicNumber(uint32_t x) const;
 
-        std::pair<int32_t, uint32_t> calculateAutoWGM(Problem const&  problem, 
-                                                      Hardware const* hardware, 
-                                                      uint32_t        skgrid) const;
+        std::pair<int32_t, int32_t> calculateAutoWGM(Problem const&  problem,
+                                                     Hardware const* hardware,
+                                                     uint32_t        skgrid) const;
         uint32_t calculateAutoGSU(Problem const& problem, Hardware const* hardware) const;
     };