diff --git a/math-libs/BLAS/CMakeLists.txt b/math-libs/BLAS/CMakeLists.txt index 65e78ff2b40..697c3f7b413 100644 --- a/math-libs/BLAS/CMakeLists.txt +++ b/math-libs/BLAS/CMakeLists.txt @@ -119,13 +119,13 @@ therock_cmake_subproject_declare(hipBLASLt EXTERNAL_SOURCE_DIR "${THEROCK_ROCM_LIBRARIES_SOURCE_DIR}/projects/hipblaslt" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipBLASLt" BACKGROUND_BUILD - CMAKE_LISTS_RELPATH "next-cmake" CMAKE_ARGS -DHIP_PLATFORM=amd -DHIPBLASLT_ENABLE_BLIS=OFF # TODO: Evaluate - -DBUILD_TESTING=${THEROCK_BUILD_TESTING} + -DTENSILELITE_BUILD_TESTING=OFF -DHIPBLASLT_BUILD_TESTING=${THEROCK_BUILD_TESTING} -DHIPBLASLT_ENABLE_ROCROLLER=${_enable_rocRoller} + -DHIPBLASLT_ENABLE_FETCH=OFF CMAKE_INCLUDES therock_explicit_finders.cmake COMPILER_TOOLCHAIN diff --git a/math-libs/BLAS/hipSOLVER b/math-libs/BLAS/hipSOLVER index 0ceedc4d5ca..0dc8c0ed596 160000 --- a/math-libs/BLAS/hipSOLVER +++ b/math-libs/BLAS/hipSOLVER @@ -1 +1 @@ -Subproject commit 0ceedc4d5ca332a8bde4ca45959e103aded45522 +Subproject commit 0dc8c0ed596b34805c6d3efe12bbff6aef2cefdc diff --git a/math-libs/BLAS/rocSOLVER b/math-libs/BLAS/rocSOLVER index e4cab8867f5..26baded6a74 160000 --- a/math-libs/BLAS/rocSOLVER +++ b/math-libs/BLAS/rocSOLVER @@ -1 +1 @@ -Subproject commit e4cab8867f5027200c0242ac5a22e7a2e510fb3f +Subproject commit 26baded6a7432828bb2a39daa5fe274c003f40f1 diff --git a/ml-libs/composable_kernel b/ml-libs/composable_kernel index 84a7600bdc5..071165919f1 160000 --- a/ml-libs/composable_kernel +++ b/ml-libs/composable_kernel @@ -1 +1 @@ -Subproject commit 84a7600bdc5cc06123a82e48348820e2dd6c3285 +Subproject commit 071165919f1237bf187e2653437bf51d6cf87a6e diff --git a/patches/amd-mainline/rocm-libraries/0002-Fix-linkage-for-tensilelite-host-target-and-bump-mxd.patch b/patches/amd-mainline/rocm-libraries/0002-Fix-linkage-for-tensilelite-host-target-and-bump-mxd.patch deleted file mode 100644 index 77fd1ee50b9..00000000000 --- a/patches/amd-mainline/rocm-libraries/0002-Fix-linkage-for-tensilelite-host-target-and-bump-mxd.patch +++ /dev/null @@ -1,199 +0,0 @@ -From f6e16aa208345c8004da839f5972ae792611a876 Mon Sep 17 00:00:00 2001 -From: David Dixon -Date: Mon, 28 Jul 2025 13:02:51 +0000 -Subject: [PATCH 02/17] Fix linkage for tensilelite-host target and bump - mxdatagen - -- Updates linkage for tensilelite-host msgpack dependency because it is - a build and usage requirement. -- Add tensilelite-host to rocroller host target - -Adopted from https://github.com/ROCm/rocm-libraries/pull/702. ---- - projects/hipblaslt/clients/CMakeLists.txt | 2 +- - .../hipblaslt/clients/common/mxDataGen.cpp | 16 +++++++------- - .../hipblaslt/clients/include/mxDataGen.hpp | 6 ++--- - projects/hipblaslt/next-cmake/CMakeLists.txt | 22 ++++++++++--------- - .../host-library/src/rocblaslt/CMakeLists.txt | 8 ++++++- - 5 files changed, 31 insertions(+), 23 deletions(-) - -diff --git a/projects/hipblaslt/clients/CMakeLists.txt b/projects/hipblaslt/clients/CMakeLists.txt -index 05e6c98901..b379bd02a3 100755 ---- a/projects/hipblaslt/clients/CMakeLists.txt -+++ b/projects/hipblaslt/clients/CMakeLists.txt -@@ -126,7 +126,7 @@ if( BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) - FetchContent_Declare( - mxDataGenerator - GIT_REPOSITORY https://github.com/ROCm/mxDataGenerator.git -- GIT_TAG 12c016dc694139317feb2e23c59028fde70beaf4 -+ GIT_TAG 31407efa7938fd70ea9b28e08a4d53e398415f8b - ) - FetchContent_MakeAvailable(mxDataGenerator) - -diff --git a/projects/hipblaslt/clients/common/mxDataGen.cpp b/projects/hipblaslt/clients/common/mxDataGen.cpp -index f3f30ce829..87732a23ae 100644 ---- a/projects/hipblaslt/clients/common/mxDataGen.cpp -+++ b/projects/hipblaslt/clients/common/mxDataGen.cpp -@@ -135,7 +135,7 @@ void packData(std::vector const& dataBytes, uint8_t* packedData) - template - std::vector getAlignedFloat(std::vector& dataBytes, - std::vector const& scaleBytes, -- std::array const sizes, -+ std::array const sizes, - int elementsPerMXBlock, - bool isMatrixA) - { -@@ -215,8 +215,8 @@ template - std::vector generateData(T dgen, - void* data, - void* scale, -- std::vector sizes, -- std::vector strides, -+ std::vector sizes, -+ std::vector strides, - uint32_t seed, - DGen::DataGeneratorOptions& opt, - int elementsPerMXBlock, -@@ -288,9 +288,9 @@ std::vector generateData(T dgen, - std::vector generateMXInput(hipDataType dataType, - void* data, - void* scale, -- int rowSize, -- int colSize, -- int stride, -+ DGen::index_t rowSize, -+ DGen::index_t colSize, -+ DGen::index_t stride, - bool isTranspose, - int const scaleBlockRowSize, - int const scaleBlockColSize, -@@ -309,8 +309,8 @@ std::vector generateMXInput(hipDataType dataType, - - const uint32_t seed = 1713573849; - -- std::vector sizes{rowSize, colSize}; -- std::vector strides; -+ std::vector sizes = {rowSize, colSize}; -+ std::vector strides; - - strides.push_back(1); - strides.push_back(stride); -diff --git a/projects/hipblaslt/clients/include/mxDataGen.hpp b/projects/hipblaslt/clients/include/mxDataGen.hpp -index 766c548d93..6f50adde61 100644 ---- a/projects/hipblaslt/clients/include/mxDataGen.hpp -+++ b/projects/hipblaslt/clients/include/mxDataGen.hpp -@@ -33,9 +33,9 @@ - std::vector generateMXInput(hipDataType dataType, - void* data, - void* scale, -- int row, -- int col, -- int stride, -+ uint64_t row, -+ uint64_t col, -+ uint64_t stride, - bool isTranspose, - int const scaleBlockRowSize, - int const scaleBlockColSize, -diff --git a/projects/hipblaslt/next-cmake/CMakeLists.txt b/projects/hipblaslt/next-cmake/CMakeLists.txt -index d29bdbec32..7a6b0efad8 100644 ---- a/projects/hipblaslt/next-cmake/CMakeLists.txt -+++ b/projects/hipblaslt/next-cmake/CMakeLists.txt -@@ -52,7 +52,7 @@ endif() - - set(HIPBLASLT_ENABLE_DEVICE ON CACHE BOOL "Build hipBLASLt device libraries.") - set(HIPBLASLT_ENABLE_CLIENT ON CACHE BOOL "Build hipBLASLt client apps.") --cmake_dependent_option(HIPBLASLT_ENABLE_HOST "Build hipBLASLt host library." ON "HIPBLASLT_ENABLE_CLIENT" OFF) -+set(HIPBLASLT_ENABLE_HOST ON CACHE BOOL "Build hipBLASLt host library.") - set(TENSILELITE_ENABLE_CLIENT OFF CACHE BOOL "Build the tensilelite client.") - set(TENSILELITE_ENABLE_HOST ON CACHE BOOL "Build the tensilelite host library.") - set(HIPBLASLT_ENABLE_COVERAGE OFF CACHE BOOL "Build gcov support") -@@ -73,7 +73,7 @@ if(HIPBLASLT_ENABLE_HOST OR TENSILELITE_ENABLE_HOST) - set(HIPBLASLT_ENABLE_MSGPACK ON CACHE BOOL "Use msgpack for parsing configuration files.") - set(HIPBLASLT_ENABLE_OPENMP ON CACHE BOOL "Use OpenMP to improve performance.") - set(HIPBLASLT_ENABLE_LLVM OFF CACHE BOOL "Use msgpack for parsing configuration files.") -- set(HIPBLASLT_ENABLE_ROCROLLER OFF CACHE BOOL "Use RocRoller library.") -+ set(HIPBLASLT_ENABLE_ROCROLLER ON CACHE BOOL "Use RocRoller library.") - set(HIPBLASLT_ENABLE_BLIS ON CACHE BOOL "Enable BLIS support.") # I don't know that we can build with this OFF - set(HIPBLASLT_ENABLE_LAZY_LOAD ON CACHE BOOL "Enable lazy loading of runtime code oject files to reduce ram usage.") - cmake_dependent_option(HIPBLASLT_ENABLE_MARKER "Use the marker library." ON "NOT WIN32" OFF) -@@ -207,17 +207,19 @@ if(HIPBLASLT_ENABLE_DEVICE) - endif() - - if(HIPBLASLT_ENABLE_ROCROLLER AND NOT TARGET roc::rocroller) -- find_package(rocroller REQUIRED) -+ find_package(rocroller QUIET) - option(YAML_CPP_INSTALL "" ON) - if(NOT rocroller_FOUND) -+ include(FetchContent) - set(ROCROLLER_ENABLE_FETCH ON) -- set(ROCROLLER_BUILD_TESTING OFF) -+ set(ROCROLLER_BUILD_TESTING ON) - set(ROCROLLER_ENABLE_CLIENT OFF) -+ set(ROCROLLER_ENABLE_GEMM_CLIENT_TESTS OFF) - FetchContent_Declare( - rocRoller - GIT_REPOSITORY https://github.com/ROCm/rocRoller.git -- GIT_TAG main -- SOURCE_SUBDIR next-cmake -+ GIT_TAG develop -+ SOURCE_SUBDIR - ) - FetchContent_MakeAvailable(rocRoller) - endif() -@@ -250,9 +252,9 @@ if(TENSILELITE_ENABLE_HOST OR HIPBLASLT_ENABLE_HOST) - - if(HIPBLASLT_ENABLE_MSGPACK) - if(msgpack-cxx_FOUND) -- target_link_libraries(tensilelite-host PRIVATE msgpack-cxx) -+ target_link_libraries(tensilelite-host PUBLIC msgpack-cxx) - else() -- target_link_libraries(tensilelite-host PRIVATE msgpackc) -+ target_link_libraries(tensilelite-host PUBLIC msgpackc) - endif() - target_compile_definitions(tensilelite-host PRIVATE TENSILE_MSGPACK) - endif() -@@ -265,7 +267,7 @@ if(TENSILELITE_ENABLE_HOST OR HIPBLASLT_ENABLE_HOST) - "${CMAKE_CURRENT_BINARY_DIR}/include" - ) - -- target_link_libraries(tensilelite-host -+ target_link_libraries(tensilelite-host - PUBLIC - rocisa::rocisa-cpp - PRIVATE -@@ -294,7 +296,7 @@ if(HIPBLASLT_ENABLE_HOST) - roc::${hipblas_target} - PRIVATE - hip::device -- tensilelite-host -+ tensilelite::tensilelite-host - ${CMAKE_DL_LIBS} - ${rocTracer} - ) -diff --git a/projects/hipblaslt/next-cmake/host-library/src/rocblaslt/CMakeLists.txt b/projects/hipblaslt/next-cmake/host-library/src/rocblaslt/CMakeLists.txt -index bd7d78695c..c710c28324 100644 ---- a/projects/hipblaslt/next-cmake/host-library/src/rocblaslt/CMakeLists.txt -+++ b/projects/hipblaslt/next-cmake/host-library/src/rocblaslt/CMakeLists.txt -@@ -45,11 +45,17 @@ if(HIPBLASLT_ENABLE_ROCROLLER) - ) - target_compile_features(hipblaslt-rocroller PRIVATE cxx_std_20) - set_target_properties(hipblaslt-rocroller PROPERTIES POSITION_INDEPENDENT_CODE ON) -- target_link_libraries(hipblaslt-rocroller PRIVATE hip::host roc::hipblas-common roc::rocroller) -+ target_link_libraries(hipblaslt-rocroller PRIVATE hip::host roc::hipblas-common roc::rocroller tensilelite::tensilelite-host) - target_compile_definitions(hipblaslt-rocroller PRIVATE HIPBLASLT_USE_ROCROLLER) - target_include_directories(hipblaslt-rocroller PRIVATE "${_CMAKE_CURRENT_SOURCE_DIR}/include") - target_include_directories(hipblaslt-rocroller PRIVATE "${_CMAKE_CURRENT_SOURCE_DIR}/../include") - target_include_directories(hipblaslt-rocroller PRIVATE "${_CMAKE_CURRENT_SOURCE_DIR}/../../../../include") - target_include_directories(hipblaslt-rocroller PRIVATE "${PROJECT_BINARY_DIR}/host-library/include") - target_link_libraries(hipblaslt PRIVATE hipblaslt-rocroller) -+ set_source_files_properties( -+ "${_CMAKE_CURRENT_SOURCE_DIR}/rocroller_host.cpp" -+ PROPERTIES -+ LANGUAGE CXX -+ COMPILE_OPTIONS "-x;c++" -+ ) - endif() --- -2.43.0 - diff --git a/patches/amd-mainline/rocm-libraries/0010-Correct-GTest-print-check-for-128-bit-ints-on-Window.patch b/patches/amd-mainline/rocm-libraries/0010-Correct-GTest-print-check-for-128-bit-ints-on-Window.patch deleted file mode 100644 index f07365b1f94..00000000000 --- a/patches/amd-mainline/rocm-libraries/0010-Correct-GTest-print-check-for-128-bit-ints-on-Window.patch +++ /dev/null @@ -1,56 +0,0 @@ -From a5d4103d389ff3b392eb98fed3460048dcfaf608 Mon Sep 17 00:00:00 2001 -From: Wayne Franz -Date: Fri, 1 Aug 2025 14:09:20 -0400 -Subject: [PATCH 10/17] Correct GTest print check for 128-bit ints on Windows - -Currently, on Windows, GTest cannot print 128-bit ints. -We have a check in test_utils::protected_assert_eq that -avoids calling ASSERT_EQ on 128-bit int values directly, -since this will cause the values to be printed in the -event of an error. - -This check was relying on the is_int128 alias, which was being -set to false_type when ROCPRIM_HAS_INT128_SUPPORT was false. -As a result, when 128-bit types were passed in, our check could -not detect them and would fail to stop the printing. - -In rocprim/types.hpp, the types rocprim::int128_t and rocprim::uint128_t -are now defined regardless of how ROCPRIM_HAS_INT128_SUPPORT is set. -This means we no longer need to guard against usage of these types -in our test code (we only need to use ROCPRIM_HAS_INT128_SUPPORT in cases -where we're doing some operation that explicitly won't work on 128-bit ints). - -This change removes the code that sets the is_int128 alias to false_type -when ROCPRIM_HAS_INT128_SUPPORT is not set. Doing this is enough -to fix the check in test_utils::protected_assert_eq. - -Cherry-picked from: https://github.com/ROCm/rocm-libraries/pull/1029 ---- - projects/rocprim/test/rocprim/test_utils_assertions.hpp | 7 ------- - 1 file changed, 7 deletions(-) - -diff --git a/projects/rocprim/test/rocprim/test_utils_assertions.hpp b/projects/rocprim/test/rocprim/test_utils_assertions.hpp -index 3bdb8c33a2..3d6a2d5c7e 100644 ---- a/projects/rocprim/test/rocprim/test_utils_assertions.hpp -+++ b/projects/rocprim/test/rocprim/test_utils_assertions.hpp -@@ -48,17 +48,10 @@ - - namespace test_utils { - --#if ROCPRIM_HAS_INT128_SUPPORT - template - using is_int128 = std::is_same::type>; - template - using is_uint128 = std::is_same::type>; --#else --template --using is_int128 = std::false_type; --template --using is_uint128 = std::false_type; --#endif // ROCPRIM_HAS_INT128_SUPPORT - - template - using is_double_custom_type = std::is_same::type, common::custom_type>; --- -2.43.0 - diff --git a/patches/amd-mainline/rocm-libraries/0011-Fix-finding-LAPACK-and-CBLAS.patch b/patches/amd-mainline/rocm-libraries/0011-Fix-finding-LAPACK-and-CBLAS.patch index 60567a760b2..5013c609de5 100644 --- a/patches/amd-mainline/rocm-libraries/0011-Fix-finding-LAPACK-and-CBLAS.patch +++ b/patches/amd-mainline/rocm-libraries/0011-Fix-finding-LAPACK-and-CBLAS.patch @@ -1,7 +1,7 @@ -From c194b05d6b1fb98da896784c580ecba5c36b1297 Mon Sep 17 00:00:00 2001 +From fa6af5c20b2d3700c3570ac8c4c5fea02e21e781 Mon Sep 17 00:00:00 2001 From: Marius Brehler -Date: Tue, 25 Mar 2025 12:13:13 +0000 -Subject: [PATCH 11/17] Fix finding LAPACK and CBLAS +Date: Tue, 26 Aug 2025 10:09:28 +0000 +Subject: [PATCH] Fix finding LAPACK and CBLAS Use target names provided by `find_package()` instead of assuming that the reference NETLIB libraries should be used. @@ -10,7 +10,7 @@ the reference NETLIB libraries should be used. 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/hipblas/clients/CMakeLists.txt b/projects/hipblas/clients/CMakeLists.txt -index a230135a10..b5e4eb0fb6 100644 +index 4ba4e950c9..6863e7c080 100644 --- a/projects/hipblas/clients/CMakeLists.txt +++ b/projects/hipblas/clients/CMakeLists.txt @@ -28,8 +28,8 @@ function( get_lapack lapack_lib lapack_inc ) @@ -33,15 +33,15 @@ index a230135a10..b5e4eb0fb6 100644 endif() endif() set( ${cblas_libs} ${libs} PARENT_SCOPE ) -@@ -221,7 +221,7 @@ if( BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) +@@ -236,7 +236,7 @@ if( BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) find_package( GTest REQUIRED ) message(STATUS "Build Dir: ${BUILD_DIR}") - message(STATUS "Linking Ref. Libs: ${BLAS_LIBRARY}") + message(STATUS "Linking Libs: ${BLAS_LIBRARY}") - if( BUILD_CLIENTS_TESTS ) - add_subdirectory( gtest ) + if( NOT TARGET hipblas ) + find_package( hipblas REQUIRED CONFIG PATHS ${HIPBLAS_LIBRARY_DIR} ) -- 2.43.0 diff --git a/patches/amd-mainline/rocm-libraries/0012-Work-around-race-condition.patch b/patches/amd-mainline/rocm-libraries/0012-Work-around-race-condition.patch deleted file mode 100644 index 62c7b1861c9..00000000000 --- a/patches/amd-mainline/rocm-libraries/0012-Work-around-race-condition.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 0e0efad44628e979ce6076ddb0ad7e193cb42a2f Mon Sep 17 00:00:00 2001 -From: Marius Brehler -Date: Fri, 21 Mar 2025 17:01:20 +0000 -Subject: [PATCH 12/17] Work around race condition -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -With `add_dependency`, compiling the `hipblas_fortran_client` target -fails with "Cannot open module file ‘hipblas.mod’ for reading at (1): No -such file or directory" as `hipblas.mod` is created in parallel but not -in time for the first run. Using `target_link_libraries` resovles this. ---- - projects/hipblas/clients/CMakeLists.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/projects/hipblas/clients/CMakeLists.txt b/projects/hipblas/clients/CMakeLists.txt -index b5e4eb0fb6..41364efb12 100644 ---- a/projects/hipblas/clients/CMakeLists.txt -+++ b/projects/hipblas/clients/CMakeLists.txt -@@ -118,7 +118,7 @@ if( BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_SAMPLES ) - else() - add_library(hipblas_fortran_client STATIC ${hipblas_f90_source_clients_no_solver}) - endif() -- add_dependencies(hipblas_fortran_client hipblas_fortran) -+ target_link_libraries(hipblas_fortran_client hipblas_fortran) - endif() - include_directories(${CMAKE_BINARY_DIR}/include/hipblas) - include_directories(${CMAKE_BINARY_DIR}/include) --- -2.43.0 - diff --git a/patches/amd-mainline/rocm-libraries/0013-Install-libhipblas_fortran.so.patch b/patches/amd-mainline/rocm-libraries/0013-Install-libhipblas_fortran.so.patch index 12ab3070fa9..12fb1ec6a37 100644 --- a/patches/amd-mainline/rocm-libraries/0013-Install-libhipblas_fortran.so.patch +++ b/patches/amd-mainline/rocm-libraries/0013-Install-libhipblas_fortran.so.patch @@ -1,26 +1,26 @@ -From 7e26c6c64ba19d9a7d6dcb736aba374407df0673 Mon Sep 17 00:00:00 2001 +From 20d3866866edae3398861692e2af98f4814f8010 Mon Sep 17 00:00:00 2001 From: Marius Brehler -Date: Tue, 1 Apr 2025 20:58:38 +0000 -Subject: [PATCH 13/17] Install `libhipblas_fortran.so` +Date: Tue, 26 Aug 2025 10:15:45 +0000 +Subject: [PATCH] Install `libhipblas_fortran.so` This is required by the test and benchmark clients but was not part of the installation so far. --- - projects/hipblas/library/src/CMakeLists.txt | 1 + + projects/hipblas/clients/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) -diff --git a/projects/hipblas/library/src/CMakeLists.txt b/projects/hipblas/library/src/CMakeLists.txt -index 7318e3fd88..8846fec9d3 100755 ---- a/projects/hipblas/library/src/CMakeLists.txt -+++ b/projects/hipblas/library/src/CMakeLists.txt -@@ -54,6 +54,7 @@ set (hipblas_f90_source - # Create hipBLAS Fortran module - if(NOT WIN32) - add_library(hipblas_fortran ${hipblas_f90_source}) -+ rocm_install(TARGETS hipblas_fortran) - endif() +diff --git a/projects/hipblas/clients/CMakeLists.txt b/projects/hipblas/clients/CMakeLists.txt +index 4ba4e950c9..e17ce117d2 100644 +--- a/projects/hipblas/clients/CMakeLists.txt ++++ b/projects/hipblas/clients/CMakeLists.txt +@@ -121,6 +121,7 @@ if( BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_SAMPLES ) + # Set Fortran module output directory + set(CMAKE_Fortran_MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/include/hipblas) + add_library(hipblas_fortran OBJECT ${hipblas_f90_source}) ++ rocm_install(TARGETS hipblas_fortran) + endif() - if(BUILD_ADDRESS_SANITIZER) + if( NOT WIN32 ) -- 2.43.0 diff --git a/patches/amd-mainline/rocm-libraries/0020-Add-missing-internals-export.patch b/patches/amd-mainline/rocm-libraries/0020-Add-missing-internals-export.patch deleted file mode 100644 index 394cb7ba743..00000000000 --- a/patches/amd-mainline/rocm-libraries/0020-Add-missing-internals-export.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 05f7bd80b25853b26f1c63dcda570e3234e2ed6b Mon Sep 17 00:00:00 2001 -From: Brian Harrison -Date: Thu, 7 Aug 2025 20:43:58 +0000 -Subject: [PATCH] Add missing internals export - ---- - projects/miopen/src/include/miopen/conv/solvers.hpp | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/projects/miopen/src/include/miopen/conv/solvers.hpp b/projects/miopen/src/include/miopen/conv/solvers.hpp -index 513b5552f2..5fa56cb8da 100644 ---- a/projects/miopen/src/include/miopen/conv/solvers.hpp -+++ b/projects/miopen/src/include/miopen/conv/solvers.hpp -@@ -4556,7 +4556,8 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; -- float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -+ MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, -+ const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; --- -2.43.0 - diff --git a/patches/amd-mainline/rocm-libraries/0021-Use-workgroupMappingDim-in-rocroller_host.patch b/patches/amd-mainline/rocm-libraries/0021-Use-workgroupMappingDim-in-rocroller_host.patch new file mode 100644 index 00000000000..fa5549691a0 --- /dev/null +++ b/patches/amd-mainline/rocm-libraries/0021-Use-workgroupMappingDim-in-rocroller_host.patch @@ -0,0 +1,27 @@ +From 6187c53459567244256d102882dc417ac24c739a Mon Sep 17 00:00:00 2001 +From: David Dixon +Date: Wed, 20 Aug 2025 20:59:07 +0000 +Subject: [PATCH 17/17] Use workgroupMappingDim in rocroller_host + +The api for command solution changed so +use the workgroupMappingDim rather than workgroupMapping +--- + .../library/src/amd_detail/rocblaslt/src/rocroller_host.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/rocroller_host.cpp b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/rocroller_host.cpp +index c505f9122d..d724d600e7 100644 +--- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/rocroller_host.cpp ++++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/rocroller_host.cpp +@@ -1369,7 +1369,7 @@ std::shared_ptr genGemmKernel(std::shared_ptr ge + "Only 0 (M) or 1 (N) are supported dimensions for workgroup mapping.", + ShowValue(dim)); + +- params->workgroupMapping = {dim, nullptr}; ++ params->workgroupMappingDim = dim; + } + + if(gemm->workgroupRemapXCC) +-- +2.25.1 + diff --git a/patches/amd-mainline/rocm-libraries/0022-Revert-fft-compile-link-device-callback-functions-wi.patch b/patches/amd-mainline/rocm-libraries/0022-Revert-fft-compile-link-device-callback-functions-wi.patch new file mode 100644 index 00000000000..f6121ac4bf0 --- /dev/null +++ b/patches/amd-mainline/rocm-libraries/0022-Revert-fft-compile-link-device-callback-functions-wi.patch @@ -0,0 +1,118 @@ +From 26d7d8da8f2a091175ac702ea17347bf88a5f825 Mon Sep 17 00:00:00 2001 +From: Marius Brehler +Date: Tue, 26 Aug 2025 16:42:49 +0000 +Subject: [PATCH] Revert "fft: compile/link device callback functions with + -fgpu-rdc (#1103)" + +This reverts commit db31de0fb97a0aecbeeaedafa755b50f6f0d5451. +--- + projects/hipfft/clients/samples/CMakeLists.txt | 12 ++++-------- + projects/hipfft/clients/tests/CMakeLists.txt | 8 -------- + .../rocfft/clients/samples/rocfft/CMakeLists.txt | 4 ---- + projects/rocfft/clients/tests/CMakeLists.txt | 4 ---- + projects/rocfft/docs/how-to/load-store-callbacks.rst | 5 ----- + projects/rocfft/library/src/CMakeLists.txt | 5 ----- + 6 files changed, 4 insertions(+), 34 deletions(-) + +diff --git a/projects/hipfft/clients/samples/CMakeLists.txt b/projects/hipfft/clients/samples/CMakeLists.txt +index 6eb7c92f3f..488e09de85 100644 +--- a/projects/hipfft/clients/samples/CMakeLists.txt ++++ b/projects/hipfft/clients/samples/CMakeLists.txt +@@ -124,12 +124,8 @@ foreach( sample ${sample_list} ) + + endforeach() + +-# callback code must be compiled as relocatable device code +-if( hipfft_callback IN_LIST sample_list ) +- if( BUILD_WITH_LIB STREQUAL "CUDA" ) +- target_compile_options( hipfft_callback PRIVATE -dc ) +- else() +- target_compile_options( hipfft_callback PRIVATE -fgpu-rdc ) +- target_link_options( hipfft_callback PRIVATE -fgpu-rdc ) +- endif() ++# cuFFT callback code must be compiled with -dc to enable relocatable ++# device code ++if( BUILD_WITH_LIB STREQUAL "CUDA" AND hipfft_callback IN_LIST sample_list ) ++ target_compile_options( hipfft_callback PRIVATE -dc ) + endif() +diff --git a/projects/hipfft/clients/tests/CMakeLists.txt b/projects/hipfft/clients/tests/CMakeLists.txt +index a2fed1c3bc..7ce9b49987 100644 +--- a/projects/hipfft/clients/tests/CMakeLists.txt ++++ b/projects/hipfft/clients/tests/CMakeLists.txt +@@ -232,14 +232,6 @@ else() + target_link_libraries( hipfft-test PRIVATE ${GTEST_LIBRARIES} ) + endif() + +-# tests have callback functions, which need to be built as relocatable device code +-if( BUILD_WITH_LIB STREQUAL "CUDA" ) +- target_compile_options( hipfft-test PRIVATE -dc ) +-else() +- target_compile_options( hipfft-test PRIVATE -fgpu-rdc ) +- target_link_options( hipfft-test PRIVATE -fgpu-rdc ) +-endif() +- + if(FFTW_MULTITHREAD) + target_compile_options( hipfft-test PRIVATE -DFFTW_MULTITHREAD ) + endif( ) +diff --git a/projects/rocfft/clients/samples/rocfft/CMakeLists.txt b/projects/rocfft/clients/samples/rocfft/CMakeLists.txt +index bfea7cea57..d07b22c922 100644 +--- a/projects/rocfft/clients/samples/rocfft/CMakeLists.txt ++++ b/projects/rocfft/clients/samples/rocfft/CMakeLists.txt +@@ -116,7 +116,3 @@ foreach( sample ${sample_list} ) + target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) + + endforeach( ) +- +-# callback functions need to be built as relocatable device code +-target_compile_options( rocfft_example_callback PRIVATE -fgpu-rdc ) +-target_link_options( rocfft_example_callback PRIVATE -fgpu-rdc ) +diff --git a/projects/rocfft/clients/tests/CMakeLists.txt b/projects/rocfft/clients/tests/CMakeLists.txt +index f6e065e93b..3e49f2345a 100644 +--- a/projects/rocfft/clients/tests/CMakeLists.txt ++++ b/projects/rocfft/clients/tests/CMakeLists.txt +@@ -100,10 +100,6 @@ add_executable( rtc_helper_crash rtc_helper_crash.cpp ) + # of a mismatch + target_compile_options( rocfft-test PRIVATE -Xarch_device -O3 ) + +-# callback functions need to be built as relocatable device code +-target_compile_options( rocfft-test PRIVATE -fgpu-rdc ) +-target_link_options( rocfft-test PRIVATE -fgpu-rdc ) +- + find_package( Boost REQUIRED ) + set( Boost_DEBUG ON ) + set( Boost_DETAILED_FAILURE_MSG ON ) +diff --git a/projects/rocfft/docs/how-to/load-store-callbacks.rst b/projects/rocfft/docs/how-to/load-store-callbacks.rst +index a8ee6ed253..9d37995443 100644 +--- a/projects/rocfft/docs/how-to/load-store-callbacks.rst ++++ b/projects/rocfft/docs/how-to/load-store-callbacks.rst +@@ -17,11 +17,6 @@ to the library using + :cpp:func:`rocfft_execution_info_set_load_callback` and + :cpp:func:`rocfft_execution_info_set_store_callback`. + +-.. note:: +- +- Callback functions must be built as relocatable device code by +- passing the ``-fgpu-rdc`` option to the compiler and linker. +- + Device functions supplied as callbacks must load and store element + data types appropriate for the transform being executed. + +diff --git a/projects/rocfft/library/src/CMakeLists.txt b/projects/rocfft/library/src/CMakeLists.txt +index 69beaa26a2..943facac61 100644 +--- a/projects/rocfft/library/src/CMakeLists.txt ++++ b/projects/rocfft/library/src/CMakeLists.txt +@@ -401,11 +401,6 @@ add_library( rocfft + ) + rocfft_add_coverage_flags( rocfft ) + +-# rocFFT contains default implementations of callback functions that +-# need to be built as relocatable device code +-target_compile_options( rocfft PRIVATE -fgpu-rdc ) +-target_link_options( rocfft PRIVATE -fgpu-rdc ) +- + if( ROCFFT_MPI_ENABLE ) + target_compile_definitions(rocfft PRIVATE ROCFFT_MPI_ENABLE) + include_directories(SYSTEM ${MPI_INCLUDE_PATH}) +-- +2.43.0 + diff --git a/patches/amd-mainline/rocm-libraries/0023-Revert-hipblaslt-Updated-tuned-TF32-NN-TN-TT-librari.patch b/patches/amd-mainline/rocm-libraries/0023-Revert-hipblaslt-Updated-tuned-TF32-NN-TN-TT-librari.patch new file mode 100644 index 00000000000..bd8504d1bc0 --- /dev/null +++ b/patches/amd-mainline/rocm-libraries/0023-Revert-hipblaslt-Updated-tuned-TF32-NN-TN-TT-librari.patch @@ -0,0 +1,96930 @@ +From 6ecb58daefb9492353f18d6dd400268905ae56f5 Mon Sep 17 00:00:00 2001 +From: Marius Brehler +Date: Wed, 27 Aug 2025 21:11:53 +0000 +Subject: [PATCH] Revert "[hipblaslt] Updated, tuned TF32 NN, TN, TT libraries + w/ Custom kernels. (#1345)" + +This reverts commit 9e7533dfe612c30e65e4fea738a9b7b89ad40ba8. +--- + ...ilk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml | 23252 +++++++++++++--- + ...lik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml | 23100 ++++++++++++--- + ...lik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml | 22762 ++++++++++++--- + 3 files changed, 55933 insertions(+), 13181 deletions(-) + +diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml +index 430357d429..8a5530e0f2 100644 +--- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml ++++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml +@@ -1,7 +1,7 @@ + - {MinimumRequiredVersion: 5.0.0} + - gfx950 + - gfx950 +-- [Device 0049, Device 0050] ++- [Device 75a0] + - Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false +@@ -265,7 +265,6 @@ + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 +- UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 +@@ -301,9 +300,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -312,24 +311,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x16_MI32Sm0b0eKQKLH-60gpTEkGyXymNjrsax7ksmaPyKvksRg= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16Op9E9FC-l3yIwRT22XZdhjYBf64YD9-IfBTWgp64-4k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -337,7 +337,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -354,36 +354,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 +- LSCB: 16 ++ LSCB: 32 + LSPA: 4 +- LSPB: 16 ++ LSPB: 32 + LVCA: 64 +- LVCB: 16 ++ LVCB: 8 + LVPA: 1 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 256 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 68608 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 68608 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 ++ LdsOffsetMetadata: 68608 ++ LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -392,10 +392,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -403,26 +403,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 +- MIWaveTileB: 4 ++ MIWaveTile: [8, 7] ++ MIWaveTileA: 8 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 256 ++ MacroTile1: 224 + MacroTileA: 256 +- MacroTileB: 256 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -431,20 +431,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 256 +- NumGlobalWriteVectorsPerThread: 64 +- NumLoadsA: 4 +- NumLoadsB: 16 ++ NumElementsPerThread: 224 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 8 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -452,18 +452,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 0 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -471,17 +471,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 4 +- ThreadTileA: 64 +- ThreadTileB: 4 ++ ThreadTile0: 32 ++ ThreadTile1: 7 ++ ThreadTileA: 32 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -497,22 +497,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -527,7 +527,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -536,7 +536,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x16_MI32MhjubPpVa8YBCgZP8HMQjS0vNgMO0nGHqCiuyY2PFls= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16VQYetxAJaGGALE1_C5NmMqKkFbrooMzr-iFIIi74xgY= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -546,14 +546,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -561,12 +562,12 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -578,57 +579,57 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 +- LSCB: 16 ++ LSCB: 32 + LSPA: 4 +- LSPB: 16 ++ LSPB: 32 + LVCA: 64 +- LVCB: 16 ++ LVCB: 8 + LVPA: 1 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 62208 ++ LdsBytesNoAmax: 125952 + LdsInitCVgprs: false +- LdsNumBytes: 62208 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 13056 ++ LdsNumBytes: 125952 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 49152 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 49152 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 6] +- MIWaveTileA: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [8, 6] ++ MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 +@@ -640,14 +641,14 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -655,20 +656,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 4 +- NumLoadsB: 12 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 8 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -676,26 +677,26 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 1 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 +@@ -720,23 +721,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -751,7 +752,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -760,24 +761,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x16_MI32XPH7m2bfGp5nPHeVyV9d6s-DvdBtRAJLSvvWS02tAYY= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16BWF29vNfxVjvcl8KZfqqnCEH5fcXtWAn5GRLzwiTr44= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -802,36 +804,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 +- LSCB: 16 ++ LSCB: 32 + LSPA: 4 +- LSPB: 64 ++ LSPB: 32 + LVCA: 64 +- LVCB: 4 ++ LVCB: 8 + LVPA: 1 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58368 ++ LdsBytesNoAmax: 123904 + LdsInitCVgprs: false +- LdsNumBytes: 58368 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 123904 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 49152 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 49152 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -840,10 +842,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -851,26 +853,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [8, 5] ++ MIWaveTileA: 8 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 128 ++ MacroTile1: 160 + MacroTileA: 256 +- MacroTileB: 128 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -879,20 +881,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 2 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 160 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 8 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -901,17 +903,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 2 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -919,17 +921,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 32 ++ ThreadTile1: 5 ++ ThreadTileA: 32 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -945,22 +947,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -973,9 +975,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -984,24 +986,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x16_MI32xYVn-B9ZHrkGdv91xnDpVwDR4gGD16m4jceSqIW-jZR8= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16ZyPw_-PJti-1T3O3Ts5XPxcFG1Uesexa645ULZGo5o4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1009,12 +1012,12 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -1026,36 +1029,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 256 +- LSCB: 16 ++ LSCB: 32 + LSPA: 4 +- LSPB: 16 ++ LSPB: 32 + LVCA: 64 +- LVCB: 16 ++ LVCB: 8 + LVPA: 1 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53760 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 53760 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 49152 +- LdsOffsetBias: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 ++ LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 49152 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -1064,37 +1067,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [8, 4] ++ MIWaveTileA: 8 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 64 ++ MacroTile1: 128 + MacroTileA: 256 +- MacroTileB: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -1103,19 +1106,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 ++ NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 ++ NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -1125,25 +1128,25 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 3 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 +@@ -1151,9 +1154,9 @@ + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 +- ThreadTile1: 2 ++ ThreadTile1: 4 + ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1168,23 +1171,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1197,9 +1200,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1208,7 +1211,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x16_MI32zRAp0CCmxjXBYYh5ze6eQoUUJ4bThioR_n9oinxr_hc= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16xevTgaRWgewooZSdioyMIpJUNWB24-zXSoy_Rk_zNfR8= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -1218,29 +1221,30 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -1250,48 +1254,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 16 ++ LSCA: 256 ++ LSCB: 32 + LSPA: 4 +- LSPB: 16 ++ LSPB: 32 + LVCA: 64 +- LVCB: 16 +- LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 256 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 62464 ++ LdsBytesNoAmax: 48128 + LdsInitCVgprs: false +- LdsNumBytes: 62464 +- LdsNumElementsAlignedA: 12288 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 48128 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12288 +- LdsOffsetB_Blk: 45056 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12288 +- LdsOffsetMetadata_Blk: 45056 ++ LdsOffsetMetadata: 48128 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1299,27 +1303,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 +- MIWaveTileB: 4 ++ MIWaveTile: [8, 3] ++ MIWaveTileA: 8 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 256 +- MacroTileA: 192 +- MacroTileB: 256 ++ MacroTile0: 256 ++ MacroTile1: 96 ++ MacroTileA: 256 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1327,20 +1331,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 192 +- NumLoadsA: 12 +- NumLoadsB: 16 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 8 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1348,36 +1352,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 4 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 4 +- ThreadTileA: 48 +- ThreadTileB: 4 ++ ThreadTile0: 32 ++ ThreadTile1: 3 ++ ThreadTileA: 32 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1392,23 +1396,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1421,9 +1425,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1432,24 +1436,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x16_MI3268o_YUuWDCULnS-6OfMhWZIZyCXM-rMwjSkjln6_m2M= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16xgeMizMvrQIy5F9bD-WBhYw-akhNTuYP1EwUWXaBysF0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1457,12 +1462,12 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -1474,48 +1479,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 16 +- LSPA: 16 +- LSPB: 16 +- LVCA: 16 +- LVCB: 16 +- LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 32 ++ LVCA: 64 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58112 ++ LdsBytesNoAmax: 41984 + LdsInitCVgprs: false +- LdsNumBytes: 58112 +- LdsNumElementsAlignedA: 12288 +- LdsNumElementsAlignedB: 13056 ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12288 +- LdsOffsetB_Blk: 45056 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12288 +- LdsOffsetMetadata_Blk: 45056 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1523,27 +1528,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [8, 2] ++ MIWaveTileA: 8 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 192 +- MacroTileA: 192 +- MacroTileB: 192 ++ MacroTile0: 256 ++ MacroTile1: 64 ++ MacroTileA: 256 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1551,20 +1556,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 3 +- NumLoadsB: 12 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 1 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1572,36 +1577,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 5 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 +- SuppressNoLoadLoop: false +- ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 32 ++ ThreadTile1: 2 ++ ThreadTileA: 32 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1616,23 +1621,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1645,9 +1650,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1656,7 +1661,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x16_MI32StQr_hosq-fOslL8Bq9BD4_0oobkEcKwUkzBTyMAuV8= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16xKuDIHlgttL3yDRqSCMREa2yA7Yuz-to8XPKRaB4Zi1A= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -1666,29 +1671,30 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -1698,48 +1704,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 16 ++ LSCA: 256 ++ LSCB: 32 + LSPA: 4 +- LSPB: 64 ++ LSPB: 32 + LVCA: 64 +- LVCB: 4 +- LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LVCB: 8 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53376 ++ LdsBytesNoAmax: 37888 + LdsInitCVgprs: false +- LdsNumBytes: 53376 +- LdsNumElementsAlignedA: 12288 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 37888 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12288 +- LdsOffsetB_Blk: 45056 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12288 +- LdsOffsetMetadata_Blk: 45056 ++ LdsOffsetMetadata: 37888 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1747,27 +1753,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [8, 1] ++ MIWaveTileA: 8 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 256 ++ MacroTile1: 32 ++ MacroTileA: 256 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1775,20 +1781,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 12 +- NumLoadsB: 2 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 8 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1802,30 +1808,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 6 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 32 ++ ThreadTile1: 1 ++ ThreadTileA: 32 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1840,23 +1846,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1869,9 +1875,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1880,31 +1886,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x16_MI32xoWvI5fNPqUnqOoIU1C1ORMnARKwPNDmtR2SiqFHg2Nc= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16P2n3oReLVxyNwvGOFq4AFplHUS5EI-52DywTP1Y4h1I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -1912,7 +1919,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -1922,36 +1929,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 16 +- LSPA: 4 +- LSPB: 64 +- LVCA: 64 +- LVCB: 4 +- LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 16896 ++ LdsBytesNoAmax: 129536 + LdsInitCVgprs: false +- LdsNumBytes: 16896 +- LdsNumElementsAlignedA: 12288 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 129536 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12288 +- LdsOffsetB_Blk: 45056 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 45056 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 29184 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -1960,10 +1967,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1971,26 +1978,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 +- MIWaveTileB: 1 ++ MIWaveTile: [7, 8] ++ MIWaveTileA: 7 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 224 ++ MacroTile1: 256 ++ MacroTileA: 224 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -1999,20 +2006,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 12 +- NumLoadsB: 1 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 224 ++ NumGlobalWriteVectorsPerThread: 224 ++ NumLoadsA: 7 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2021,17 +2028,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 7 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -2039,17 +2046,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 1 +- ThreadTileA: 48 +- ThreadTileB: 1 ++ ThreadTile0: 28 ++ ThreadTile1: 8 ++ ThreadTileA: 28 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2065,22 +2072,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2095,7 +2102,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2104,24 +2111,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x16_MI32q1hRR2VchMX141_kJvhQ2tSW_Epw9QA5Z9OiaVc3thI= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16dxeNeAAbN_fn8sa9A5stfenZidFXL-f90RWM80Wde4o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -2146,36 +2154,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 16 +- LSPA: 8 +- LSPB: 64 +- LVCA: 32 +- LVCB: 4 +- LVPA: 2 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58368 ++ LdsBytesNoAmax: 130560 + LdsInitCVgprs: false +- LdsNumBytes: 58368 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 130560 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 40960 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 40960 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 29184 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -2184,37 +2192,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [1, 8] +- MIWaveTileA: 1 +- MIWaveTileB: 8 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 7] ++ MIWaveTileA: 7 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 224 ++ MacroTile1: 224 ++ MacroTileA: 224 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2223,20 +2231,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 +- NonTemporalE: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 128 +- NumLoadsA: 2 +- NumLoadsB: 4 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 196 ++ NumGlobalWriteVectorsPerThread: 196 ++ NumLoadsA: 7 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2245,17 +2253,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 8 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -2263,17 +2271,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 8 +- ThreadTileA: 16 +- ThreadTileB: 8 ++ ThreadTile0: 28 ++ ThreadTile1: 7 ++ ThreadTileA: 28 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2289,22 +2297,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2319,7 +2327,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2328,7 +2336,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x16_MI32bhIZ5KxYu1ycDpmXtpyp4P_uoZcClKvmYyn394BY15g= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16hzZqm22vx6_DN3DNriYR2k5VmUtqnZJTUoDiUKpPnd8= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -2338,29 +2346,30 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -2370,48 +2379,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 16 +- LSPA: 2 +- LSPB: 64 +- LVCA: 128 +- LVCB: 4 +- LVPA: 2 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53440 ++ LdsBytesNoAmax: 122368 + LdsInitCVgprs: false +- LdsNumBytes: 53440 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 12480 ++ LdsNumBytes: 122368 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 40960 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 40960 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 29184 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2419,27 +2428,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 3] +- MIWaveTileA: 2 +- MIWaveTileB: 3 ++ MIWaveTile: [7, 6] ++ MIWaveTileA: 7 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 128 ++ MacroTile0: 224 + MacroTile1: 192 +- MacroTileA: 128 ++ MacroTileA: 224 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -2447,20 +2456,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 3 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 168 ++ NumGlobalWriteVectorsPerThread: 168 ++ NumLoadsA: 7 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 3 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2474,30 +2483,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 9 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 3 +- ThreadTileA: 32 +- ThreadTileB: 3 ++ ThreadTile0: 28 ++ ThreadTile1: 6 ++ ThreadTileA: 28 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2512,23 +2521,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2543,7 +2552,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2552,39 +2561,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x16_MI32qaf0u37ok9Nk_pb4X6_4dRVvY2KMiZHpB1VT0_ke35o= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16Mg5TF7AfbON8D_eOQtpsT1Nj28OWS1uKHrYiF4d_MI8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -2594,36 +2604,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 16 +- LSPA: 2 +- LSPB: 16 +- LVCA: 128 +- LVCB: 16 +- LVPA: 2 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 50176 ++ LdsBytesNoAmax: 120320 + LdsInitCVgprs: false +- LdsNumBytes: 50176 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 120320 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 40960 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 40960 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 29184 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -2632,10 +2642,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2643,26 +2653,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveTile: [7, 5] ++ MIWaveTileA: 7 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 224 ++ MacroTile1: 160 ++ MacroTileA: 224 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2671,20 +2681,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 8 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 140 ++ NumGlobalWriteVectorsPerThread: 140 ++ NumLoadsA: 7 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2693,35 +2703,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 10 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 5 ++ ThreadTileA: 28 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2736,23 +2746,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2765,9 +2775,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2776,24 +2786,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x16_MI32x2c2hkDyXiye6C0VNO9NrSXRKJLvVC4DIxiTWkzYASGw= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16YJd4m2BS85_ESZYfZ9ipCJbFLKQ364Lu4ab5FUj7PdU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -2818,36 +2829,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 16 +- LSPA: 8 +- LSPB: 64 +- LVCA: 32 +- LVCB: 4 +- LVPA: 2 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 29184 ++ LdsBytesNoAmax: 46592 + LdsInitCVgprs: false +- LdsNumBytes: 29184 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 46592 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 24576 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 24576 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 46592 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -2856,37 +2867,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [1, 2] +- MIWaveTileA: 1 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 4] ++ MIWaveTileA: 7 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 +- MagicDivAlg: 2 ++ MacroTile0: 224 ++ MacroTile1: 128 ++ MacroTileA: 224 ++ MacroTileB: 128 ++ MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2895,20 +2906,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 2 +- NumLoadsB: 1 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 112 ++ NumLoadsA: 7 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2917,17 +2928,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 11 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -2935,17 +2946,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 4 ++ ThreadTileA: 28 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2961,22 +2972,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2991,7 +3002,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3000,31 +3011,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x16_MI32xdpo4Elh36uHFvCocZBZQboRtvEVnv6_GCle-iJDBqq0= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x54_hDmqKabtIEC5r9ASqT0jlSsSxi50kwwapyX0U5D0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -3032,7 +3044,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -3042,36 +3054,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 16 +- LSPA: 4 +- LSPB: 64 +- LVCA: 64 +- LVCB: 4 +- LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 21504 ++ LdsBytesNoAmax: 44544 + LdsInitCVgprs: false +- LdsNumBytes: 21504 +- LdsNumElementsAlignedA: 4096 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 44544 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 4096 +- LdsOffsetB_Blk: 36864 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 21504 +- LdsOffsetMetadata_Blk: 36864 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 44544 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3080,10 +3092,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3091,26 +3103,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 4] +- MIWaveTileA: 1 +- MIWaveTileB: 4 ++ MIWaveTile: [7, 3] ++ MIWaveTileA: 7 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 256 +- MacroTileA: 64 +- MacroTileB: 256 ++ MacroTile0: 224 ++ MacroTile1: 96 ++ MacroTileA: 224 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -3119,20 +3131,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 64 +- NumLoadsA: 4 +- NumLoadsB: 4 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 84 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 7 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3141,17 +3153,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 12 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -3159,17 +3171,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 4 +- ThreadTileA: 16 +- ThreadTileB: 4 ++ ThreadTile0: 28 ++ ThreadTile1: 3 ++ ThreadTileA: 28 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3185,22 +3197,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3213,9 +3225,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3224,39 +3236,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x16_MI32xe-6pJxoR0HuhgKOBDzCm0i1EffMHG1wwAvt4oUYC6TI= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x_Ssic5ypdDbqEFWb_Fh7ekj1kaXqU8vXYGs-nnLOLWk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -3266,48 +3279,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 16 +- LSPA: 4 +- LSPB: 16 +- LVCA: 64 +- LVCB: 16 +- LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49920 ++ LdsBytesNoAmax: 38400 + LdsInitCVgprs: false +- LdsNumBytes: 49920 +- LdsNumElementsAlignedA: 4096 +- LdsNumElementsAlignedB: 13056 ++ LdsNumBytes: 38400 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 4096 +- LdsOffsetB_Blk: 36864 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4096 +- LdsOffsetMetadata_Blk: 36864 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 38400 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3315,27 +3328,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] +- MIWaveTileA: 1 +- MIWaveTileB: 3 ++ MIWaveTile: [7, 2] ++ MIWaveTileA: 7 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 224 ++ MacroTile1: 64 ++ MacroTileA: 224 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -3343,20 +3356,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 4 +- NumLoadsB: 12 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 7 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3365,17 +3378,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 13 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -3383,17 +3396,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 28 ++ ThreadTile1: 2 ++ ThreadTileA: 28 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3409,22 +3422,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3437,9 +3450,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3448,31 +3461,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x16_MI32xgg9R4PXCfJzpoUyoWadf1lqwU3STwBFNIL1mA6hiXZY= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16xKQv0jVw6NOH8Fp3p-lgFrGG3G1EYTA2Tz-pfcAKNSq0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -3480,7 +3494,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -3490,36 +3504,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 16 +- LSPA: 4 +- LSPB: 64 +- LVCA: 64 +- LVCB: 4 +- LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 29696 ++ LdsBytesNoAmax: 34304 + LdsInitCVgprs: false +- LdsNumBytes: 29696 +- LdsNumElementsAlignedA: 4096 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 34304 ++ LdsNumElementsAlignedA: 29184 ++ LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4096 +- LdsOffsetB_Blk: 20480 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 29184 ++ LdsOffsetB_Blk: 94720 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4096 +- LdsOffsetMetadata_Blk: 20480 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 34304 ++ LdsOffsetMetadata_Blk: 94720 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3528,10 +3542,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3539,26 +3553,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 2] +- MIWaveTileA: 1 +- MIWaveTileB: 2 ++ MIWaveTile: [7, 1] ++ MIWaveTileA: 7 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 224 ++ MacroTile1: 32 ++ MacroTileA: 224 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -3567,20 +3581,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 2 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 7 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3589,17 +3603,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 14 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -3607,17 +3621,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 1 ++ ThreadTileA: 28 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3633,22 +3647,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3663,7 +3677,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3672,7 +3686,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x16_MI32x3fdCiZE1jMEdMA9HAEkdVFidjRks_9sDYrEgCeQBfRF8= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16NPomCbun5g5X8-ycG4osmpTalrQIZNO9Z-xvLtPUfes= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -3682,29 +3696,30 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -3714,36 +3729,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 16 +- LSPA: 4 +- LSPB: 64 +- LVCA: 64 +- LVCB: 4 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 16 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 25088 ++ LdsBytesNoAmax: 124928 + LdsInitCVgprs: false +- LdsNumBytes: 25088 +- LdsNumElementsAlignedA: 4096 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 124928 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4096 +- LdsOffsetB_Blk: 20480 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4096 +- LdsOffsetMetadata_Blk: 20480 ++ LdsOffsetMetadata: 24576 ++ LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3752,10 +3767,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3763,48 +3778,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTile: [6, 8] ++ MIWaveTileA: 6 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 192 ++ MacroTile1: 256 ++ MacroTileA: 192 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 +- NumLoadsB: 1 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 192 ++ NumGlobalWriteVectorsPerThread: 96 ++ NumLoadsA: 6 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3812,36 +3827,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 15 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 8 ++ ThreadTileA: 24 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3856,23 +3871,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3887,7 +3902,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3896,39 +3911,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x16_MI32x3A0PaqG4By4CV5ns1S0GisnKCgFqOoWurPUoMID_agyM= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI162KgKyKezUzdryApAlBbzizztsW66klpErIDoJMDJ1iw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -3938,36 +3954,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 32 +- LSCB: 16 +- LSPA: 4 ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 + LSPB: 32 +- LVCA: 32 +- LVCB: 4 ++ LVCA: 16 ++ LVCB: 8 + LVPA: 4 + LVPB: 8 +- LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 14848 ++ LdsBytesNoAmax: 125952 + LdsInitCVgprs: false +- LdsNumBytes: 14848 +- LdsNumElementsAlignedA: 2048 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 125952 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 8192 +- LdsOffsetB: 2048 +- LdsOffsetB_Blk: 10240 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 2048 +- LdsOffsetMetadata_Blk: 10240 ++ LdsOffsetMetadata: 24576 ++ LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3976,60 +3992,60 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 7] ++ MIWaveTileA: 6 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 32 +- MacroTile1: 64 +- MacroTileA: 32 +- MacroTileB: 64 ++ MacroTile0: 192 ++ MacroTile1: 224 ++ MacroTileA: 192 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 +- NumLoadsB: 2 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 168 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 6 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 2 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -4037,35 +4053,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 16 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 7 ++ ThreadTileA: 24 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4080,23 +4096,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -4111,7 +4127,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4120,7 +4136,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x16_MI32x3D1nsyEjx7S7ZBTQpPzBm5N2WWyBE4WAjD-voqIZb10g= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16zEjMDqk8GUpljOHU9Tznod3l-qUxdWN5Xp5GYBPPEMA= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -4130,29 +4146,30 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -4162,36 +4179,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 16 +- LSPA: 2 ++ LSCB: 32 ++ LSPA: 16 + LSPB: 32 +- LVCA: 64 +- LVCB: 4 +- LVPA: 2 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 + LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 14592 ++ LdsBytesNoAmax: 117760 + LdsInitCVgprs: false +- LdsNumBytes: 14592 +- LdsNumElementsAlignedA: 4096 +- LdsNumElementsAlignedB: 2304 ++ LdsNumBytes: 117760 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 8192 +- LdsOffsetB: 4096 +- LdsOffsetB_Blk: 12288 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4096 +- LdsOffsetMetadata_Blk: 12288 ++ LdsOffsetMetadata: 24576 ++ LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -4200,37 +4217,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 +- MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 6] ++ MIWaveTileA: 6 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 192 ++ MacroTileA: 192 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -4239,21 +4256,21 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 1 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 6 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 1 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -4266,30 +4283,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 17 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 6 ++ ThreadTileA: 24 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4304,23 +4321,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -4333,9 +4350,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4344,24 +4361,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x32_MI32FECJNdqE4dlr93iFlCAeQT2bPBaDgMRI0XQ06xnEYVs= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16Y6lRGFnidkNajK3PdK-VBnAFfZAXLqV7Kg6aNpLIWTY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4374,7 +4392,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -4386,48 +4404,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 ++ LSCA: 64 + LSCB: 32 +- LSPA: 4 ++ LSPA: 16 + LSPB: 32 +- LVCA: 64 ++ LVCA: 16 + LVCB: 8 +- LVPA: 1 ++ LVPA: 4 + LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 123264 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 123264 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 24960 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98304 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 +- LdsOffsetMetadata_Blk: 98304 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -4435,27 +4453,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 3] +- MIWaveTileA: 4 +- MIWaveTileB: 3 ++ MIWaveTile: [6, 5] ++ MIWaveTileA: 6 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 192 +- MacroTileA: 256 +- MacroTileB: 192 ++ MacroTile0: 192 ++ MacroTile1: 160 ++ MacroTileA: 192 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -4463,20 +4481,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 6 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 120 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 6 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 6 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -4484,14 +4502,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 20 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4500,20 +4518,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 3 +- ThreadTileA: 64 +- ThreadTileB: 3 ++ ThreadTile0: 24 ++ ThreadTile1: 5 ++ ThreadTileA: 24 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4528,17 +4546,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -4557,9 +4575,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4568,7 +4586,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI16AJztYbHpdmewINeyCLwwj6EmLaSY8yILSCN3cVFzJkI= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16Ui0bhGCFa66JDviV6lT8dQfQOb8riCo1OQKtLBn1H9Q= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -4579,13 +4597,14 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4598,7 +4617,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -4610,34 +4629,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 ++ LSCA: 64 + LSCB: 32 +- LSPA: 4 ++ LSPA: 16 + LSPB: 32 +- LVCA: 64 ++ LVCA: 16 + LVCB: 8 +- LVPA: 1 ++ LVPA: 4 + LVPB: 8 +- LdsBlockSizePerPadA: 4096 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 41984 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98304 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 +- LdsOffsetMetadata_Blk: 98304 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -4645,8 +4664,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -4658,14 +4677,14 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [4, 8] +- MIWaveTileA: 4 +- MIWaveTileB: 8 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 4] ++ MIWaveTileA: 6 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 256 ++ MacroTile0: 192 + MacroTile1: 128 +- MacroTileA: 256 ++ MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 +@@ -4678,8 +4697,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -4687,19 +4706,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 6 + NumLoadsB: 4 +- NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -4714,8 +4733,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 21 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4724,20 +4743,14195 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 16 +- SubGroup1: 16 +- SubGroupA: 16 +- SubGroupB: 16 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 4 ++ ThreadTileA: 24 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16x3UX6Ot-zbrkVheKPZPcv9YFUni3KWrpFbNf7Le1KSco= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 39936 ++ LdsInitCVgprs: false ++ LdsNumBytes: 39936 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 90112 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 39936 ++ LdsOffsetMetadata_Blk: 90112 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 3] ++ MIWaveTileA: 6 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 96 ++ MacroTileA: 192 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 6 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 3 ++ ThreadTileA: 24 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16xHk6L8TtDRgVnxipvvts3g7MyH4fl9TFhip4k32FY2XE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 33792 ++ LdsInitCVgprs: false ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 90112 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 90112 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 2] ++ MIWaveTileA: 6 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 64 ++ MacroTileA: 192 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 6 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 2 ++ ThreadTileA: 24 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16xRgCF1GKbk2DFnwv-0XW3NMLI7ofWgVakR6S04b-cFiM= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 29696 ++ LdsInitCVgprs: false ++ LdsNumBytes: 29696 ++ LdsNumElementsAlignedA: 24576 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 24576 ++ LdsOffsetB_Blk: 57344 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 29696 ++ LdsOffsetMetadata_Blk: 57344 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 1] ++ MIWaveTileA: 6 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 32 ++ MacroTileA: 192 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 6 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 1 ++ ThreadTileA: 24 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16zpN8L3fRY-rhAIwGIEqCePWQwlmMsAvDgb7NUclsqcs= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 121344 ++ LdsInitCVgprs: false ++ LdsNumBytes: 121344 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 86528 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 20992 ++ LdsOffsetMetadata_Blk: 86528 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 8] ++ MIWaveTileA: 5 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 256 ++ MacroTileA: 160 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 160 ++ NumGlobalWriteVectorsPerThread: 160 ++ NumLoadsA: 5 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 8 ++ ThreadTileA: 20 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI169ky7B2ES_nmAj9G9Hs7YRmoEHE87LhLIKd729jWf3DY= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 122368 ++ LdsInitCVgprs: false ++ LdsNumBytes: 122368 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 86528 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 20992 ++ LdsOffsetMetadata_Blk: 86528 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 7] ++ MIWaveTileA: 5 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 224 ++ MacroTileA: 160 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 140 ++ NumGlobalWriteVectorsPerThread: 140 ++ NumLoadsA: 5 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 7 ++ ThreadTileA: 20 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16GeNw0oPGLEldGmfqzzl266oOMGiSNtVLRyqtJggID3M= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 48640 ++ LdsInitCVgprs: false ++ LdsNumBytes: 48640 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 86528 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 48640 ++ LdsOffsetMetadata_Blk: 86528 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 6] ++ MIWaveTileA: 5 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 192 ++ MacroTileA: 160 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 120 ++ NumGlobalWriteVectorsPerThread: 120 ++ NumLoadsA: 5 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 6 ++ ThreadTileA: 20 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16RkfXGWMPEkSn2szl0l5qfA2ILE8ut8wt57SusZZER7k= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 46592 ++ LdsInitCVgprs: false ++ LdsNumBytes: 46592 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 86528 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 46592 ++ LdsOffsetMetadata_Blk: 86528 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 5] ++ MIWaveTileA: 5 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 160 ++ MacroTileA: 160 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 100 ++ NumGlobalWriteVectorsPerThread: 100 ++ NumLoadsA: 5 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 5 ++ ThreadTileA: 20 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16HtWpB6cOJEF4DbwfEMWryn-5b79yyBlfzz8rayx56rA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 38400 ++ LdsInitCVgprs: false ++ LdsNumBytes: 38400 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 86528 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 38400 ++ LdsOffsetMetadata_Blk: 86528 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 128 ++ MacroTileA: 160 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 5 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 4 ++ ThreadTileA: 20 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16xLrW7bUYrRxa0twPMedwlsGtIzDt8CoA9mgtie4DWQEA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 36352 ++ LdsInitCVgprs: false ++ LdsNumBytes: 36352 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 86528 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 36352 ++ LdsOffsetMetadata_Blk: 86528 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 3] ++ MIWaveTileA: 5 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 96 ++ MacroTileA: 160 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 5 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 3 ++ ThreadTileA: 20 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16xpypcg-cTfku28ecRt7_t0QgAI6Jt9o4AomShdclcVH8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30208 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 53760 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 30208 ++ LdsOffsetMetadata_Blk: 53760 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 2] ++ MIWaveTileA: 5 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 64 ++ MacroTileA: 160 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 5 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 2 ++ ThreadTileA: 20 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16xSFac73JEfRlV1mqS7O-M92uixXuT_kbGgwYBOdp_27E= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 26112 ++ LdsInitCVgprs: false ++ LdsNumBytes: 26112 ++ LdsNumElementsAlignedA: 20992 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 20992 ++ LdsOffsetB_Blk: 53760 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 26112 ++ LdsOffsetMetadata_Blk: 53760 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 1] ++ MIWaveTileA: 5 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 32 ++ MacroTileA: 160 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 5 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 1 ++ ThreadTileA: 20 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16491rZwxDr9-AFVEQTsHJ3q65xJ5EMCM65Z8vZi78Eug= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 51200 ++ LdsInitCVgprs: false ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 81920 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 8] ++ MIWaveTileA: 4 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 256 ++ MacroTileA: 128 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 8 ++ ThreadTileA: 16 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16Lk2FjW8nnBE-JtCicnEFAhrakkaoAwHUa3or9BdVD0M= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 52224 ++ LdsInitCVgprs: false ++ LdsNumBytes: 52224 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 52224 ++ LdsOffsetMetadata_Blk: 81920 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 7] ++ MIWaveTileA: 4 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 224 ++ MacroTileA: 128 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 4 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 7 ++ ThreadTileA: 16 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16nwjfAVuWIbYU5PauX6T4Fi5vEc2Darx4wfGDfD5ZHHo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 44032 ++ LdsInitCVgprs: false ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 81920 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 6] ++ MIWaveTileA: 4 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 192 ++ MacroTileA: 128 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 4 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 6 ++ ThreadTileA: 16 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16dPNy9Rb9Nl6qTqYHoV5tFKWSMtN3uJv3wFjiZV2neDo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 41984 ++ LdsInitCVgprs: false ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 81920 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 160 ++ MacroTileA: 128 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 4 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 5 ++ ThreadTileA: 16 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16cfqw6zxFpDlveHsqm_XmOGZ1imjKBlfWx9UVc47wOko= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 33792 ++ LdsInitCVgprs: false ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 81920 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 4 ++ ThreadTileA: 16 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16xWhyvJtQoDtQZ25PEIbEP_foBAwSB6Sbmo492I4a3JfU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 31744 ++ LdsInitCVgprs: false ++ LdsNumBytes: 31744 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 49152 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 31744 ++ LdsOffsetMetadata_Blk: 49152 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 96 ++ MacroTileA: 128 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 4 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16xY70AbRhvdkU43kASQ92-2i6QiEJVpI_MGGxOs2vsIyM= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 25600 ++ LdsInitCVgprs: false ++ LdsNumBytes: 25600 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 49152 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 49152 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 2] ++ MIWaveTileA: 4 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 64 ++ MacroTileA: 128 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 2 ++ ThreadTileA: 16 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16xwLd5YMjwwbFIip9_0LW4R5y1DRej7vRHi86EbNl8niY= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 128 ++ LSCB: 32 ++ LSPA: 8 ++ LSPB: 32 ++ LVCA: 32 ++ LVCB: 8 ++ LVPA: 2 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 21504 ++ LdsInitCVgprs: false ++ LdsNumBytes: 21504 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 49152 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 21504 ++ LdsOffsetMetadata_Blk: 49152 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 32 ++ MacroTileA: 128 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 4 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16xE4Eu5sOsW7F7rL4esDcqlR8qMnGMIFCq-NR07Dea7BU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 47616 ++ LdsInitCVgprs: false ++ LdsNumBytes: 47616 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 78336 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 47616 ++ LdsOffsetMetadata_Blk: 78336 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 8] ++ MIWaveTileA: 3 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 256 ++ MacroTileA: 96 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 96 ++ NumLoadsA: 3 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 8 ++ ThreadTileA: 12 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16xd8Rp6slimDET3NNvgQwvNYJm0BHB_QcCbAQdChY4WCw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 48640 ++ LdsInitCVgprs: false ++ LdsNumBytes: 48640 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 78336 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 48640 ++ LdsOffsetMetadata_Blk: 78336 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 7] ++ MIWaveTileA: 3 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 224 ++ MacroTileA: 96 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 84 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 3 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 7 ++ ThreadTileA: 12 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16xtfOPIScvk08BBiRYWyQ3LnIy9bMEIB1Ik_9InBB8M7I= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 40448 ++ LdsInitCVgprs: false ++ LdsNumBytes: 40448 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 78336 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 40448 ++ LdsOffsetMetadata_Blk: 78336 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 6] ++ MIWaveTileA: 3 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 192 ++ MacroTileA: 96 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 3 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 6 ++ ThreadTileA: 12 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16x8-49AEFjkHKWNMFmT5qGjEVAcD_0uSJ89KlANmB-LsY= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 38400 ++ LdsInitCVgprs: false ++ LdsNumBytes: 38400 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 78336 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 38400 ++ LdsOffsetMetadata_Blk: 78336 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 5] ++ MIWaveTileA: 3 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 160 ++ MacroTileA: 96 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 3 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 5 ++ ThreadTileA: 12 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16xR180YrqQJnFE8_S_Sa1vEu78mXT-ugMupYpVOWGWL5Y= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30208 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 45568 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 30208 ++ LdsOffsetMetadata_Blk: 45568 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 128 ++ MacroTileA: 96 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 3 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x1jicppHdGecRxok-K3rQfBp_bkCF8W5lBgK9aYLAJ4bg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 28160 ++ LdsInitCVgprs: false ++ LdsNumBytes: 28160 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 45568 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 28160 ++ LdsOffsetMetadata_Blk: 45568 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 3] ++ MIWaveTileA: 3 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 96 ++ MacroTileA: 96 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 36 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 3 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 3 ++ ThreadTileA: 12 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x1i-Yr6QbljwiT9h7oNBHxsXh3agLwDOXxd8D7S_ocLNA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 22016 ++ LdsInitCVgprs: false ++ LdsNumBytes: 22016 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 45568 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 22016 ++ LdsOffsetMetadata_Blk: 45568 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 2] ++ MIWaveTileA: 3 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 64 ++ MacroTileA: 96 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 3 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 2 ++ ThreadTileA: 12 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x1nbtYo-_TiNChE8VtYJM7ZWMB0mBfsTkpKAO9oe_qLNU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 17920 ++ LdsInitCVgprs: false ++ LdsNumBytes: 17920 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 45568 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 17920 ++ LdsOffsetMetadata_Blk: 45568 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 3 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 1 ++ ThreadTileA: 12 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16xo5B-XCjOzO4vBgCWk0cuHMYRuTi52Ow1uzSVWh3vKQo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 43008 ++ LdsInitCVgprs: false ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 73728 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 73728 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 8] ++ MIWaveTileA: 2 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 256 ++ MacroTileA: 64 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 8 ++ ThreadTileA: 8 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16xfj5jWvrG-jMHKy-P9MrCm8PesWQV0QP2Mu8wu26UPQ0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 44032 ++ LdsInitCVgprs: false ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 73728 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 73728 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 7] ++ MIWaveTileA: 2 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 224 ++ MacroTileA: 64 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 2 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 7 ++ ThreadTileA: 8 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16xsD0BfB6bbZatG0borGb0Od9EBb53nHK02M9VJG1k5tw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 35840 ++ LdsInitCVgprs: false ++ LdsNumBytes: 35840 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 73728 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 73728 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 6] ++ MIWaveTileA: 2 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 192 ++ MacroTileA: 64 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 2 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 6 ++ ThreadTileA: 8 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16xhZoHGJtUHKWTTqSJRm9V5drMDRxKhrLQqAIebM_b8A0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 33792 ++ LdsInitCVgprs: false ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 73728 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 73728 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 5] ++ MIWaveTileA: 2 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 160 ++ MacroTileA: 64 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 2 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 5 ++ ThreadTileA: 8 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16xohlCCCU1Xgrn6LbtNU9UsmHhYaq5Gy29VtOpqP_DkiQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 25600 ++ LdsInitCVgprs: false ++ LdsNumBytes: 25600 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 40960 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 40960 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 4] ++ MIWaveTileA: 2 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 128 ++ MacroTileA: 64 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 2 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 4 ++ ThreadTileA: 8 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x1nPp0GsLw9phVxnXDH6R9l0em80o98wtLKaHP1M2JWMA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 23552 ++ LdsInitCVgprs: false ++ LdsNumBytes: 23552 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 40960 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 23552 ++ LdsOffsetMetadata_Blk: 40960 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 3] ++ MIWaveTileA: 2 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 96 ++ MacroTileA: 64 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 2 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 3 ++ ThreadTileA: 8 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x1SpQEr2l4_ONWaKnnHFklMlWdJ3vr_1LHIhHK5xKMMbc= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 17408 ++ LdsInitCVgprs: false ++ LdsNumBytes: 17408 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 40960 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 17408 ++ LdsOffsetMetadata_Blk: 40960 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 2] ++ MIWaveTileA: 2 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 64 ++ MacroTileA: 64 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 2 ++ ThreadTileA: 8 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x1ROukzoS3pzDPIBytOkPOQnc9drTKTwA-AdFTVKJA_oc= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 29696 ++ LdsInitCVgprs: false ++ LdsNumBytes: 29696 ++ LdsNumElementsAlignedA: 8192 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 8192 ++ LdsOffsetB_Blk: 24576 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 8192 ++ LdsOffsetMetadata_Blk: 24576 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 32 ++ MacroTileA: 64 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 2 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 1 ++ ThreadTileA: 8 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16xRl27SMrP2QBRVLlvRXM2iFMj-hsaUeC5Jd4EUZQE04o= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 39424 ++ LdsInitCVgprs: false ++ LdsNumBytes: 39424 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 70144 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 39424 ++ LdsOffsetMetadata_Blk: 70144 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 8] ++ MIWaveTileA: 1 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 256 ++ MacroTileA: 32 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 8 ++ ThreadTileA: 4 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16xu7hu76KGVPX3CRzJuSqCNwo-5tH48LVKI6AZT1xj21Q= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 40448 ++ LdsInitCVgprs: false ++ LdsNumBytes: 40448 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 70144 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 40448 ++ LdsOffsetMetadata_Blk: 70144 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 7] ++ MIWaveTileA: 1 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 224 ++ MacroTileA: 32 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 1 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 7 ++ ThreadTileA: 4 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16xnlYodV_vh4n4pEJEzWOm9cX3m9aEGpoc2voPsv_MEps= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 32256 ++ LdsInitCVgprs: false ++ LdsNumBytes: 32256 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 37376 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 32256 ++ LdsOffsetMetadata_Blk: 37376 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 6] ++ MIWaveTileA: 1 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 192 ++ MacroTileA: 32 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 1 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 6 ++ ThreadTileA: 4 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16xOX3O5QgfYqdzCFMfCiP5YIggLgUAqgkj0sVScGfST4I= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30208 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 37376 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 30208 ++ LdsOffsetMetadata_Blk: 37376 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 5] ++ MIWaveTileA: 1 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 160 ++ MacroTileA: 32 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 1 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 5 ++ ThreadTileA: 4 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16xdvquVbi41yyyGOysV8bBmZUS0_9HWDMcTCNI1arJ3Xs= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 22016 ++ LdsInitCVgprs: false ++ LdsNumBytes: 22016 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 37376 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 22016 ++ LdsOffsetMetadata_Blk: 37376 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 128 ++ MacroTileA: 32 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x1Neb8Hq4348Hn-aV9exUDY0xhO1hgeG0hYHDMx6_U1Ds= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 19968 ++ LdsInitCVgprs: false ++ LdsNumBytes: 19968 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 37376 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 19968 ++ LdsOffsetMetadata_Blk: 37376 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] ++ MIWaveTileA: 1 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 96 ++ MacroTileA: 32 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 1 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x19V6B5pUkhhRAqR_cEBdl-Sa1f13Kyvvt9UyEMJZBGgU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30208 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 4608 ++ LdsOffsetMetadata_Blk: 20992 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 2] ++ MIWaveTileA: 1 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 64 ++ MacroTileA: 32 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 1 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x1u0ZhraAPCmtVQGcyz6PLMzm9mZgHy7MgI2WVIcfYqaM= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 26112 ++ LdsInitCVgprs: false ++ LdsNumBytes: 26112 ++ LdsNumElementsAlignedA: 4608 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 20992 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 4608 ++ LdsOffsetMetadata_Blk: 20992 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 1] ++ MIWaveTileA: 1 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 1 ++ ThreadTileA: 4 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16qX06Hd5yRvk7_9ivNNEWS--sTPqXFn7e8iddnWSUBfk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 71168 ++ LdsInitCVgprs: false ++ LdsNumBytes: 71168 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 38400 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 163840 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 71168 ++ LdsOffsetMetadata_Blk: 163840 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 15] ++ MIWaveTileA: 4 ++ MIWaveTileB: 15 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 240 ++ MacroTileA: 256 ++ MacroTileB: 240 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 240 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 8 ++ NumLoadsB: 15 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 15 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 15 ++ ThreadTileA: 16 ++ ThreadTileB: 15 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16EeFaXNL1oG2bh5l4Fg71sF-BVQNUStSzf8w4uY9KMq8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 66048 ++ LdsInitCVgprs: false ++ LdsNumBytes: 66048 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 33280 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 163840 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 66048 ++ LdsOffsetMetadata_Blk: 163840 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 13] ++ MIWaveTileA: 4 ++ MIWaveTileB: 13 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 208 ++ MacroTileA: 256 ++ MacroTileB: 208 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 208 ++ NumGlobalWriteVectorsPerThread: 52 ++ NumLoadsA: 8 ++ NumLoadsB: 13 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 13 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 13 ++ ThreadTileA: 16 ++ ThreadTileB: 13 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16IQZr7FaRRc7fBjXLHKqeFgH6SWXZT7aEdbZGcqQW86s= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 126464 ++ LdsInitCVgprs: false ++ LdsNumBytes: 126464 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 28160 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 98304 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 11] ++ MIWaveTileA: 4 ++ MIWaveTileB: 11 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 176 ++ MacroTileA: 256 ++ MacroTileB: 176 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 176 ++ NumGlobalWriteVectorsPerThread: 44 ++ NumLoadsA: 8 ++ NumLoadsB: 11 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 11 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 11 ++ ThreadTileA: 16 ++ ThreadTileB: 11 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16GteJlC9kGdUpH4vmcGL3gVwSE6tX_Ep9ybUGvpOaS3k= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 121344 ++ LdsInitCVgprs: false ++ LdsNumBytes: 121344 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 23040 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 98304 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 9] ++ MIWaveTileA: 4 ++ MIWaveTileB: 9 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 144 ++ MacroTileA: 256 ++ MacroTileB: 144 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 8 ++ NumLoadsB: 9 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 9 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 9 ++ ThreadTileA: 16 ++ ThreadTileB: 9 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16F4aPjOb6Fg2uNUySyLVRS9OadOPwn3T9VX17kvBNhx8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 50688 ++ LdsInitCVgprs: false ++ LdsNumBytes: 50688 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 17920 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 50688 ++ LdsOffsetMetadata_Blk: 98304 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 7] ++ MIWaveTileA: 4 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 112 ++ MacroTileA: 256 ++ MacroTileB: 112 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 8 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 7 ++ ThreadTileA: 16 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16xqIXGv9TjE0TjtxfhU0r0VbfnyM4ASRQJm193Fv6Imj0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 45568 ++ LdsInitCVgprs: false ++ LdsNumBytes: 45568 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 12800 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 45568 ++ LdsOffsetMetadata_Blk: 98304 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 80 ++ MacroTileA: 256 ++ MacroTileB: 80 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 8 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 5 ++ ThreadTileA: 16 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16xyhelIUiKOVJSBt849n0AUrGV2l7ZCOV75UMYwGWBYmE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 40448 ++ LdsInitCVgprs: false ++ LdsNumBytes: 40448 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 7680 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 40448 ++ LdsOffsetMetadata_Blk: 98304 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 48 ++ MacroTileA: 256 ++ MacroTileB: 48 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 8 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16xxwjRPgpoZLBlKLyiiWDX_wmkKsN5P_2DpVWox6VZJuQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 32 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 35328 ++ LdsInitCVgprs: false ++ LdsNumBytes: 35328 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 2560 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 35328 ++ LdsOffsetMetadata_Blk: 98304 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 16 ++ MacroTileA: 256 ++ MacroTileB: 16 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16xe9WAH98wHVCQB2IMP4ScWDxrlwiImgUwONJRbJ4g_Q= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3840_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA15_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3840 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 66048 ++ LdsInitCVgprs: false ++ LdsNumBytes: 66048 ++ LdsNumElementsAlignedA: 31232 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 31232 ++ LdsOffsetB_Blk: 162304 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 66048 ++ LdsOffsetMetadata_Blk: 162304 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [15, 4] ++ MIWaveTileA: 15 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 240 ++ MacroTile1: 256 ++ MacroTileA: 240 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 240 ++ NumGlobalWriteVectorsPerThread: 240 ++ NumLoadsA: 15 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 15 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3840_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA15_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 60 ++ ThreadTile1: 4 ++ ThreadTileA: 60 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16s9AwPPYAgZrdFv1yxVV5BeGZy47glmS2p9huhmIUQ9M= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3328_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA13_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 3328 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 127488 ++ LdsInitCVgprs: false ++ LdsNumBytes: 127488 ++ LdsNumElementsAlignedA: 27136 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27136 ++ LdsOffsetB_Blk: 92672 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 27136 ++ LdsOffsetMetadata_Blk: 92672 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [13, 4] ++ MIWaveTileA: 13 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 208 ++ MacroTile1: 256 ++ MacroTileA: 208 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 208 ++ NumGlobalWriteVectorsPerThread: 208 ++ NumLoadsA: 13 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 13 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3328_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA13_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 52 ++ ThreadTile1: 4 ++ ThreadTileA: 52 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16XqWyQPl-C6IYiLfrShVs17FUE-aXyT6pTBjetqaSdbQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2816_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA11_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2816 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 123392 ++ LdsInitCVgprs: false ++ LdsNumBytes: 123392 ++ LdsNumElementsAlignedA: 23040 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 23040 ++ LdsOffsetB_Blk: 88576 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 23040 ++ LdsOffsetMetadata_Blk: 88576 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [11, 4] ++ MIWaveTileA: 11 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 176 ++ MacroTile1: 256 ++ MacroTileA: 176 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 176 ++ NumGlobalWriteVectorsPerThread: 176 ++ NumLoadsA: 11 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 11 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2816_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA11_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 44 ++ ThreadTile1: 4 ++ ThreadTileA: 44 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16UTdTSttfA5rG4lSFnD_Hxu_YMAz2s0OHULcM9w22OiU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2304_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 2304 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 119296 ++ LdsInitCVgprs: false ++ LdsNumBytes: 119296 ++ LdsNumElementsAlignedA: 18944 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 18944 ++ LdsOffsetB_Blk: 84480 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 18944 ++ LdsOffsetMetadata_Blk: 84480 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [9, 4] ++ MIWaveTileA: 9 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 144 ++ MacroTile1: 256 ++ MacroTileA: 144 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 144 ++ NumLoadsA: 9 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 9 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2304_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 36 ++ ThreadTile1: 4 ++ ThreadTileA: 36 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16npNQZ_xDxbcDci4tUEH3gUpL3K9wYsAx94VI6BirQqc= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1792_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1792 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 49664 ++ LdsInitCVgprs: false ++ LdsNumBytes: 49664 ++ LdsNumElementsAlignedA: 14848 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 14848 ++ LdsOffsetB_Blk: 80384 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 49664 ++ LdsOffsetMetadata_Blk: 80384 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [7, 4] ++ MIWaveTileA: 7 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 112 ++ MacroTile1: 256 ++ MacroTileA: 112 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 112 ++ NumLoadsA: 7 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 7 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1792_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 4 ++ ThreadTileA: 28 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16x5k8Eyy7kEQ7rE2P6uwPfhJ1tN_0_b0GyViZfqt2Z690= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1280_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1280 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 45568 ++ LdsInitCVgprs: false ++ LdsNumBytes: 45568 ++ LdsNumElementsAlignedA: 10752 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 10752 ++ LdsOffsetB_Blk: 76288 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 45568 ++ LdsOffsetMetadata_Blk: 76288 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 80 ++ MacroTile1: 256 ++ MacroTileA: 80 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 5 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 5 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1280_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 4 ++ ThreadTileA: 20 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x0TaAIUbzNknWzmsVy8BpvzHaerbdS9kDJrrM69B_kOA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 768 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 41472 ++ LdsInitCVgprs: false ++ LdsNumBytes: 41472 ++ LdsNumElementsAlignedA: 6656 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 6656 ++ LdsOffsetB_Blk: 72192 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 41472 ++ LdsOffsetMetadata_Blk: 72192 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 48 ++ MacroTile1: 256 ++ MacroTileA: 48 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 3 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16xRfCxqgxLQ7E06lqYULJXwk-u3b7IBC8SrKdgBzV2GN8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 16 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 37376 ++ LdsInitCVgprs: false ++ LdsNumBytes: 37376 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 68096 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 37376 ++ LdsOffsetMetadata_Blk: 68096 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 16 ++ MacroTile1: 256 ++ MacroTileA: 16 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x3SCiEBTunR3sczJ9ZmoaWqYIdHBq_9Xcby6vu_uQgeU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 256 ++ LSCB: 64 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 74752 ++ LdsInitCVgprs: false ++ LdsNumBytes: 74752 ++ LdsNumElementsAlignedA: 65536 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 65536 ++ LdsOffsetB_Blk: 196608 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 74752 ++ LdsOffsetMetadata_Blk: 196608 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [8, 1] ++ MIWaveTileA: 8 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 32 ++ MacroTileA: 256 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 16 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 16 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 32 ++ ThreadTile1: 1 ++ ThreadTileA: 32 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16xaz9Rct1y64bDYy3jTu1iNja3ySNSLNFGTourE9Msf-I= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 3584 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 75776 ++ LdsInitCVgprs: false ++ LdsNumBytes: 75776 ++ LdsNumElementsAlignedA: 58368 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 58368 ++ LdsOffsetB_Blk: 189440 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 75776 ++ LdsOffsetMetadata_Blk: 189440 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 2] ++ MIWaveTileA: 7 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 224 ++ MacroTile1: 64 ++ MacroTileA: 224 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 14 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 7 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 2 ++ ThreadTileA: 28 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16xvdtmtLnCrF_9Cs0lH3U64lbRHgLF-U7UB__-VeC1edo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 3584 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 67584 ++ LdsInitCVgprs: false ++ LdsNumBytes: 67584 ++ LdsNumElementsAlignedA: 58368 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 58368 ++ LdsOffsetB_Blk: 189440 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 67584 ++ LdsOffsetMetadata_Blk: 189440 ++ LdsPadA: 16 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 1] ++ MIWaveTileA: 7 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 224 ++ MacroTile1: 32 ++ MacroTileA: 224 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 14 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 7 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 1 ++ ThreadTileA: 28 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: false ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16xqhyBKGILah__IHVunxy0oZK5BY5IVSidGP5QTiRbAFQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: false ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 76800 ++ LdsInitCVgprs: false ++ LdsNumBytes: 76800 ++ LdsNumElementsAlignedA: 49152 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 49152 ++ LdsOffsetB_Blk: 180224 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 76800 ++ LdsOffsetMetadata_Blk: 180224 ++ LdsPadA: 0 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 3] ++ MIWaveTileA: 6 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 96 ++ MacroTileA: 192 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 12 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 3 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 8 +- ThreadTileA: 16 +- ThreadTileB: 8 ++ ThreadTile0: 24 ++ ThreadTile1: 3 ++ ThreadTileA: 24 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4752,23 +18946,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -4781,9 +18975,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4792,24 +18986,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x32_MI32xoUJ1-94jqC77C5LIdMy-coKqYJSKzadR6U3cZVufziU= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16xMp9OKcArPksqWEyAD3PIlx_bYf3SsjBzuAThlXVGVfk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4834,76 +19029,76 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 +- LSCB: 32 +- LSPA: 4 +- LSPB: 32 +- LVCA: 64 +- LVCB: 8 +- LVPA: 1 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 106624 ++ LdsBytesNoAmax: 66560 + LdsInitCVgprs: false +- LdsNumBytes: 106624 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 66560 ++ LdsNumElementsAlignedA: 49152 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98304 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 49152 ++ LdsOffsetB_Blk: 180224 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 +- LdsOffsetMetadata_Blk: 98304 ++ LdsOffsetMetadata: 66560 ++ LdsOffsetMetadata_Blk: 180224 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 2] ++ MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 256 ++ MacroTile0: 192 + MacroTile1: 64 +- MacroTileA: 256 ++ MacroTileA: 192 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -4911,20 +19106,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 2 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 12 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -4938,12 +19133,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 22 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -4951,16 +19146,16 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 24 + ThreadTile1: 2 +- ThreadTileA: 32 ++ ThreadTileA: 24 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -4983,16 +19178,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -5005,9 +19200,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5016,24 +19211,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32PE5ctNim5l0VsfCXnU_Lj4hsqbeBcGkIwwc_wNQk3H0= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16xhSNSie10EQxA_X-HOEcIk8MMioNzUFXngM9D9wG0HLM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5046,7 +19242,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -5058,48 +19254,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 32 ++ LSCB: 64 + LSPA: 16 +- LSPB: 32 ++ LSPB: 16 + LVCA: 16 +- LVCB: 8 ++ LVCB: 16 + LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 3072 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 123392 ++ LdsBytesNoAmax: 58368 + LdsInitCVgprs: false +- LdsNumBytes: 123392 +- LdsNumElementsAlignedA: 24576 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 58368 ++ LdsNumElementsAlignedA: 49152 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 24576 +- LdsOffsetB_Blk: 90112 ++ LdsOffsetB: 49152 ++ LdsOffsetB_Blk: 114688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 24576 +- LdsOffsetMetadata_Blk: 90112 ++ LdsOffsetMetadata: 58368 ++ LdsOffsetMetadata_Blk: 114688 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5107,27 +19303,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 +- MIWaveTileB: 4 ++ MIWaveTile: [6, 1] ++ MIWaveTileA: 6 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 +- MacroTile1: 256 ++ MacroTile1: 32 + MacroTileA: 192 +- MacroTileB: 256 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -5135,20 +19331,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 192 +- NumLoadsA: 6 +- NumLoadsB: 8 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 12 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5162,30 +19358,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 23 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 4 +- ThreadTileA: 48 +- ThreadTileB: 4 ++ ThreadTile0: 24 ++ ThreadTile1: 1 ++ ThreadTileA: 24 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5200,23 +19396,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -5229,9 +19425,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5240,90 +19436,91 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x32_MI32XfDzoSEnT4wLT1M-dSUXiznipwgssptSm93obRCGwKM= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16EmaKnEfBxA8X2m0kCigAtbg10ewcgtEUdVUgX1hOpZc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +- InterleaveAlpha: 0 +- InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, +- SupportUserGSU: false, UseUniversalArgs: true} +- Kernel: true +- KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 +- LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 4 +- LSPB: 8 +- LVCA: 64 +- LVCB: 32 +- LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 256 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 116224 ++ LdsBytesNoAmax: 75776 + LdsInitCVgprs: false +- LdsNumBytes: 116224 +- LdsNumElementsAlignedA: 24576 +- LdsNumElementsAlignedB: 26112 ++ LdsNumBytes: 75776 ++ LdsNumElementsAlignedA: 41984 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 24576 +- LdsOffsetB_Blk: 90112 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 41984 ++ LdsOffsetB_Blk: 173056 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 24576 +- LdsOffsetMetadata_Blk: 90112 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 75776 ++ LdsOffsetMetadata_Blk: 173056 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5331,48 +19528,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 192 +- MacroTileA: 192 +- MacroTileB: 192 ++ MacroTile0: 160 ++ MacroTile1: 128 ++ MacroTileA: 160 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 24 +- NumLoadsB: 24 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 10 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 24 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5386,12 +19583,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 24 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -5399,17 +19596,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ ThreadTile0: 20 ++ ThreadTile1: 4 ++ ThreadTileA: 20 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5425,22 +19622,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -5453,9 +19650,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5464,24 +19661,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x32_MI321UDhXb7FWHcssMvyrd7sKnCXWw5HgEv5S95ZpQhwaV4= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16xEnq6NbjY49t1RRcvA_10sTKEqyOKh42857KiXzXANVA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5506,48 +19704,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 16 +- LSPB: 32 +- LVCA: 16 +- LVCB: 8 +- LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 106752 ++ LdsBytesNoAmax: 69632 + LdsInitCVgprs: false +- LdsNumBytes: 106752 +- LdsNumElementsAlignedA: 24576 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 69632 ++ LdsNumElementsAlignedA: 41984 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 24576 +- LdsOffsetB_Blk: 90112 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 41984 ++ LdsOffsetB_Blk: 173056 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 24576 +- LdsOffsetMetadata_Blk: 90112 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 69632 ++ LdsOffsetMetadata_Blk: 173056 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5555,27 +19753,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [5, 3] ++ MIWaveTileA: 5 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 160 ++ MacroTile1: 96 ++ MacroTileA: 160 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -5583,20 +19781,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 6 +- NumLoadsB: 4 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 10 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5604,18 +19802,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 25 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -5623,17 +19821,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 20 ++ ThreadTile1: 3 ++ ThreadTileA: 20 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5649,22 +19847,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -5679,7 +19877,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5688,24 +19886,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x32_MI32xQ4Tfe4XtkZ201za7RqlWHc19PQDwzuw6RU3x1DH1d7w= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16xEimTkrrdB-glCT6cmc-RTb6JTMX_va4YdTp01lL0Tp0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5730,36 +19929,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 16 +- LSPB: 32 +- LVCA: 16 +- LVCB: 8 +- LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 33792 ++ LdsBytesNoAmax: 59392 + LdsInitCVgprs: false +- LdsNumBytes: 33792 +- LdsNumElementsAlignedA: 24576 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 59392 ++ LdsNumElementsAlignedA: 41984 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 24576 +- LdsOffsetB_Blk: 90112 ++ LdsOffsetB: 41984 ++ LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33792 +- LdsOffsetMetadata_Blk: 90112 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 59392 ++ LdsOffsetMetadata_Blk: 107520 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -5768,10 +19967,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5779,26 +19978,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 +- MIWaveTileB: 1 ++ MIWaveTile: [5, 2] ++ MIWaveTileA: 5 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 192 ++ MacroTile0: 160 + MacroTile1: 64 +- MacroTileA: 192 ++ MacroTileA: 160 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -5807,20 +20006,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 6 +- NumLoadsB: 2 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 10 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5834,30 +20033,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 26 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 +- StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 1 +- ThreadTileA: 48 +- ThreadTileB: 1 ++ ThreadTile0: 20 ++ ThreadTile1: 2 ++ ThreadTileA: 20 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5873,22 +20072,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -5901,9 +20100,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5912,39 +20111,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI16qfKh-6hX24qirvxhkwRcYoj4CvH0gesdFL3AI4EUioU= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16xtKd1NocuREpqOnna-3kpvsNGZbcIEbNZoPx41P9v_5w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -5954,45 +20154,45 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 32 +- LSPA: 2 +- LSPB: 32 +- LVCA: 128 +- LVCB: 8 +- LVPA: 2 +- LVPB: 8 +- LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 2560 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115712 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 115712 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 41984 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetB: 41984 ++ LdsOffsetB_Blk: 107520 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 +- LdsPadA: 0 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 107520 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 1 +- LoopUnroll: 32 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -6003,14 +20203,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 8] +- MIWaveTileA: 4 +- MIWaveTileB: 8 ++ MIWaveTile: [5, 1] ++ MIWaveTileA: 5 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 160 ++ MacroTile1: 32 ++ MacroTileA: 160 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -6022,8 +20222,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6031,20 +20231,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 16 +- NumLoadsB: 8 +- NumLoadsCoalescedA: 1 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 10 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6052,36 +20252,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 27 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2560_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 8 +- ThreadTileA: 16 +- ThreadTileB: 8 ++ ThreadTile0: 20 ++ ThreadTile1: 1 ++ ThreadTileA: 20 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6096,8 +20296,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 4 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -6105,14 +20305,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -6125,9 +20325,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6136,39 +20336,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI320qJr_ixIH7bLRKS-judrTTEpQc4naPPCIGIjrDkSbDU= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16UdnXum065RHNWMR-wIrQ1y1SFXBCoviTf7Ak2_waSVg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -6178,76 +20379,76 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 +- LSCB: 32 +- LSPA: 2 +- LSPB: 8 +- LVCA: 128 +- LVCB: 32 ++ LSCB: 64 ++ LSPA: 8 ++ LSPB: 16 ++ LVCA: 32 ++ LVCB: 16 + LVPA: 2 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 108032 ++ LdsBytesNoAmax: 78848 + LdsInitCVgprs: false +- LdsNumBytes: 108032 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 26112 ++ LdsNumBytes: 78848 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 ++ LdsOffsetMetadata: 78848 ++ LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [1, 6] +- MIWaveTileA: 1 +- MIWaveTileB: 6 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 +- MacroTile1: 192 ++ MacroTile1: 160 + MacroTileA: 128 +- MacroTileB: 192 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6255,20 +20456,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 16 +- NumLoadsB: 24 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 8 ++ NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 24 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6276,26 +20477,26 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 28 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 +@@ -6303,9 +20504,9 @@ + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 6 ++ ThreadTile1: 5 + ThreadTileA: 16 +- ThreadTileB: 6 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6320,23 +20521,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -6349,9 +20550,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6360,31 +20561,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16cA0KR_fvpsEvSX4Bd1pX_uas6s0yqRXVd0njKICPHl8= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16jWEE2t4bMNSOtvsh5Vs18_hvuVqelKjrnrm4hnO3cXM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -6392,7 +20594,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -6402,34 +20604,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 +- LSCB: 32 +- LSPA: 2 +- LSPB: 32 +- LVCA: 128 +- LVCB: 8 ++ LSCB: 64 ++ LSPA: 8 ++ LSPB: 16 ++ LVCA: 32 ++ LVCB: 16 + LVPA: 2 +- LVPB: 8 ++ LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 66560 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 66560 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 ++ LdsOffsetMetadata: 66560 ++ LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -6437,10 +20639,10 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 1 +- LoopUnroll: 32 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -6470,8 +20672,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6479,20 +20681,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6506,12 +20708,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 29 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -6519,7 +20721,7 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 +@@ -6553,14 +20755,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -6573,9 +20775,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6584,39 +20786,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x8syoe0Fp576ipLudEuwFhHnwocvAu-7bxdIyBKePCUY= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16x3BaggAP86JGaEjS6YGxeppyUlOY3TbiSv1dIcF0gwIU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: 0 ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -6626,34 +20829,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 +- LSCB: 32 +- LSPA: 2 +- LSPB: 8 +- LVCA: 128 +- LVCB: 32 ++ LSCB: 64 ++ LSPA: 8 ++ LSPB: 16 ++ LVCA: 32 ++ LVCB: 16 + LVPA: 2 +- LVPB: 8 ++ LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58368 ++ LdsBytesNoAmax: 60416 + LdsInitCVgprs: false +- LdsNumBytes: 58368 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 60416 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 49152 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 49152 ++ LdsOffsetMetadata: 60416 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -6663,8 +20866,8 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 1 +- LoopUnroll: 32 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -6675,14 +20878,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] ++ MIWaveTile: [4, 3] + MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 +- MacroTile1: 64 ++ MacroTile1: 96 + MacroTileA: 128 +- MacroTileB: 64 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -6694,7 +20897,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -6703,20 +20906,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 8 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6724,18 +20927,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 0 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 30 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -6743,7 +20946,7 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 +@@ -6751,9 +20954,9 @@ + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 2 ++ ThreadTile1: 3 + ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6769,7 +20972,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -6777,14 +20980,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -6797,9 +21000,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6808,39 +21011,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32xBg7Tred-kexqcoVD-Yj1hUrDSnlbVkZ6Dkl12OuhjZ0= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16xJrxor9fQNq5UremgoBPAketDv8I_QPQZ-ThOYMXkhqI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -6850,48 +21054,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 4 +- LSPB: 32 +- LVCA: 64 +- LVCB: 8 +- LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 128 ++ LSCB: 64 ++ LSPA: 8 ++ LSPB: 16 ++ LVCA: 32 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 107008 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 73728 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 73728 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -6899,27 +21103,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 4] +- MIWaveTileA: 1 +- MIWaveTileB: 4 ++ MIWaveTile: [4, 2] ++ MIWaveTileA: 4 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 256 +- MacroTileA: 64 +- MacroTileB: 256 ++ MacroTile0: 128 ++ MacroTile1: 64 ++ MacroTileA: 128 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6927,20 +21131,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 64 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 +- NumLoadsB: 8 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6948,36 +21152,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 31 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 4 ++ ThreadTile1: 2 + ThreadTileA: 16 +- ThreadTileB: 4 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6992,23 +21196,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 4 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -7023,7 +21227,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7032,24 +21236,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x32_MI32xdhbd_LY5tlXIISIbTp1LMn1aIcaCDInkBERRzM-7SVE= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16xsgr3p3CMK44M_IxbZliUWQ_awJ0YVH0GVMgcXTaps_w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -7062,7 +21267,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -7074,36 +21279,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 16 +- LSPB: 32 +- LVCA: 16 +- LVCB: 8 +- LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LSCA: 128 ++ LSCB: 64 ++ LSPA: 8 ++ LSPB: 16 ++ LVCA: 32 ++ LVCB: 16 ++ LVPA: 2 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 35840 +- LdsInitCVgprs: false +- LdsNumBytes: 35840 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 27648 ++ LdsBytesNoAmax: 41984 ++ LdsInitCVgprs: false ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 73728 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 35840 +- LdsOffsetMetadata_Blk: 73728 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -7112,10 +21317,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -7123,26 +21328,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] +- MIWaveTileA: 1 +- MIWaveTileB: 3 ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 128 ++ MacroTile1: 32 ++ MacroTileA: 128 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7151,20 +21356,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 2 +- NumLoadsB: 6 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 6 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7178,30 +21383,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 32 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 3 ++ ThreadTile1: 1 + ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7216,23 +21421,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -7245,9 +21450,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7256,39 +21461,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16xV6W-FzEdwx9X9hnVrKp2DU2RciHEi37vMMr7WTvtAoo= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16xSO2b7nMcNxZLUza2zLSkdDb9iPQ14JdyxYKfxmiZlK8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: 0 ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -7298,35 +21504,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 4 +- LSPB: 8 +- LVCA: 64 +- LVCB: 32 +- LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 1024 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58368 ++ LdsBytesNoAmax: 77824 + LdsInitCVgprs: false +- LdsNumBytes: 58368 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 77824 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 40960 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 156672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 40960 +- LdsPadA: 0 ++ LdsOffsetMetadata: 77824 ++ LdsOffsetMetadata_Blk: 156672 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -7335,8 +21541,8 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 1 +- LoopUnroll: 32 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -7347,14 +21553,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 4] +- MIWaveTileA: 2 +- MIWaveTileB: 4 ++ MIWaveTile: [3, 6] ++ MIWaveTileA: 3 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 96 ++ MacroTile1: 192 ++ MacroTileA: 96 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -7366,7 +21572,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7375,20 +21581,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 16 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 6 ++ NumLoadsB: 12 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7396,36 +21602,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 0 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 33 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 95 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 8 +- ThreadTile1: 4 +- ThreadTileA: 8 +- ThreadTileB: 4 ++ ThreadTile0: 12 ++ ThreadTile1: 6 ++ ThreadTileA: 12 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7440,8 +21646,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 4 ++ VectorWidthA: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -7449,14 +21655,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -7471,7 +21677,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7480,31 +21686,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x32_MI32x37firzumDIhSG_XOjTv7P5YIkACnXcLqpb8cqDlw6wWM= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16xZIuZr6H100561_77i9lSWgtoHYm1wC4yBIMd_cL_ur0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -7512,7 +21719,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -7522,36 +21729,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 4 +- LSPB: 32 +- LVCA: 64 +- LVCB: 8 +- LVPA: 4 +- LVPB: 8 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 128 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 17408 ++ LdsBytesNoAmax: 71680 + LdsInitCVgprs: false +- LdsNumBytes: 17408 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 71680 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 40960 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 156672 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 40960 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 71680 ++ LdsOffsetMetadata_Blk: 156672 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -7560,10 +21767,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -7571,26 +21778,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTile: [3, 5] ++ MIWaveTileA: 3 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 160 ++ MacroTileA: 96 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7599,20 +21806,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 2 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 6 ++ NumLoadsB: 10 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7620,18 +21827,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 34 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 96 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -7639,17 +21846,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 12 ++ ThreadTile1: 5 ++ ThreadTileA: 12 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7671,16 +21878,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -7693,9 +21900,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7704,31 +21911,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x32_MI32x3RDaB4loF34D2VNmijhZjyCW_tFUZnSMDRTfCQDAbI5Q= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16xt1rVys8J1p6wirY9P2v9tFO9Q2kkNKhdA_HxSzyGLho= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -7736,7 +21944,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -7746,116 +21954,116 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 32 +- LSPA: 4 ++ LSCB: 64 ++ LSPA: 32 + LSPB: 16 +- LVCA: 32 +- LVCB: 8 +- LVPA: 4 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 28800 ++ LdsBytesNoAmax: 59392 + LdsInitCVgprs: false +- LdsNumBytes: 28800 +- LdsNumElementsAlignedA: 4096 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 59392 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4096 +- LdsOffsetB_Blk: 20480 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4096 +- LdsOffsetMetadata_Blk: 20480 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 59392 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 32 +- MacroTile1: 64 +- MacroTileA: 32 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 128 ++ MacroTileA: 96 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 4 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 6 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 4 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 35 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 97 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -7863,17 +22071,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7889,22 +22097,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -7917,9 +22125,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7928,31 +22136,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x32_MI32x3DUVXJ0V85Zfq_1ANNZk7Luo9LkYFgPk7G3PS9FNjn_U= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x1yJotYygzLGSzFxHOmvY1UYw80s4AcSP10dzrGCYIFjI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -7960,7 +22169,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -7970,76 +22179,76 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 32 +- LSPA: 2 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 + LSPB: 16 +- LVCA: 64 +- LVCB: 8 +- LVPA: 2 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 28736 ++ LdsBytesNoAmax: 53248 + LdsInitCVgprs: false +- LdsNumBytes: 28736 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 4160 ++ LdsNumBytes: 53248 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 24576 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 24576 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 53248 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 3] ++ MIWaveTileA: 3 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 96 ++ MacroTile1: 96 ++ MacroTileA: 96 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -8047,39 +22256,39 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 2 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 36 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 6 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 2 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 36 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 98 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -8087,17 +22296,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 12 ++ ThreadTile1: 3 ++ ThreadTileA: 12 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8119,16 +22328,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -8143,7 +22352,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8152,24 +22361,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x64_MI32JlUGZ9I3uZiWkePMLu2jzp9wPZsBpnv3DsUnEXSVxVA= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x1CBrMIQkoy8q9InVwBc72O67uSByrrH0BO9SRpZCq7iU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8182,7 +22392,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -8194,36 +22404,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 ++ LSCA: 32 + LSCB: 64 +- LSPA: 4 ++ LSPA: 32 + LSPB: 16 +- LVCA: 64 ++ LVCA: 8 + LVCB: 16 +- LVPA: 1 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 43008 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 65536 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 65536 +- LdsOffsetB_Blk: 196608 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 98816 +- LdsOffsetMetadata_Blk: 196608 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8231,38 +22441,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 4] +- MIWaveTileA: 2 +- MIWaveTileB: 4 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 2] ++ MIWaveTileA: 3 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 128 +- MacroTileA: 256 +- MacroTileB: 128 ++ MacroTile0: 96 ++ MacroTile1: 64 ++ MacroTileA: 96 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8271,20 +22481,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 64 +- NumLoadsA: 16 +- NumLoadsB: 8 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 6 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8298,8 +22508,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 37 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 99 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8308,20 +22518,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 4 +- ThreadTileA: 32 +- ThreadTileB: 4 ++ ThreadTile0: 12 ++ ThreadTile1: 2 ++ ThreadTileA: 12 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8336,17 +22546,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 4 ++ VectorWidthA: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8367,7 +22577,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8376,24 +22586,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x64_MI32x09EgIheOf2JVLGd6BTz4FqtjyWRFGiyLKWEEPS_0RW4= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x1WYpLP-6rkTJhw50cp2ByeyoEBuiJAyYONPFEDNS9jIA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8406,7 +22617,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -8418,36 +22629,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 ++ LSCA: 32 + LSCB: 64 +- LSPA: 4 ++ LSPA: 32 + LSPB: 16 +- LVCA: 64 ++ LVCA: 8 + LVCB: 16 +- LVPA: 1 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82432 ++ LdsBytesNoAmax: 34816 + LdsInitCVgprs: false +- LdsNumBytes: 82432 +- LdsNumElementsAlignedA: 65536 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 34816 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 65536 +- LdsOffsetB_Blk: 196608 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82432 +- LdsOffsetMetadata_Blk: 196608 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8455,38 +22666,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 64 +- MacroTileA: 256 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8495,20 +22706,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 16 +- NumLoadsB: 4 +- NumLoadsCoalescedA: 1 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 6 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8516,14 +22727,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 38 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 100 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8532,20 +22743,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 12 ++ ThreadTile1: 1 ++ ThreadTileA: 12 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8560,17 +22771,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8591,7 +22802,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8600,24 +22811,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x64_MI32OahhzigQvO0g3Vo5qA8y2YUpQC21WZ2cBoxlHiCPwmw= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16xn-GjHzXZqHuGeiIYWXncwIht8OpT-xsoOXNYzbCEnpQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8630,7 +22842,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -8642,7 +22854,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -8652,26 +22864,26 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 101376 ++ LdsBytesNoAmax: 80896 + LdsInitCVgprs: false +- LdsNumBytes: 101376 +- LdsNumElementsAlignedA: 49152 +- LdsNumElementsAlignedB: 52224 ++ LdsNumBytes: 80896 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 49152 +- LdsOffsetB_Blk: 180224 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 147456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 101376 +- LdsOffsetMetadata_Blk: 180224 ++ LdsOffsetMetadata: 80896 ++ LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8679,11 +22891,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8691,26 +22903,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [2, 7] ++ MIWaveTileA: 2 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 192 +- MacroTileA: 192 +- MacroTileB: 192 ++ MacroTile0: 64 ++ MacroTile1: 224 ++ MacroTileA: 64 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8719,20 +22931,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 12 +- NumLoadsB: 12 +- NumLoadsCoalescedA: 3 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 4 ++ NumLoadsB: 14 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8746,8 +22958,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 39 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 101 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8756,20 +22968,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ ThreadTile0: 8 ++ ThreadTile1: 7 ++ ThreadTileA: 8 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8784,17 +22996,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8815,7 +23027,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8824,24 +23036,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x64_MI32J1YCJtLTPLWGDwgB8AXiiTrrpoHQWqeeTzlJGm3VKUw= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16xVb6wlNINVsV2ZBTDGsvXSY028IgUq_FwcXAMVp71UI0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8854,7 +23067,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -8866,7 +23079,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -8876,26 +23089,26 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82944 ++ LdsBytesNoAmax: 68608 + LdsInitCVgprs: false +- LdsNumBytes: 82944 +- LdsNumElementsAlignedA: 49152 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 68608 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 49152 +- LdsOffsetB_Blk: 180224 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 147456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82944 +- LdsOffsetMetadata_Blk: 180224 ++ LdsOffsetMetadata: 68608 ++ LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8903,11 +23116,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8915,26 +23128,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [2, 6] ++ MIWaveTileA: 2 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 64 ++ MacroTile1: 192 ++ MacroTileA: 64 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8943,20 +23156,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 12 +- NumLoadsB: 8 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 4 ++ NumLoadsB: 12 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8970,8 +23183,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 40 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 102 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8980,20 +23193,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 6 ++ ThreadTileA: 8 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9008,17 +23221,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9039,7 +23252,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9048,24 +23261,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x64_MI16x99c2l5aFvIM6u7olDGbTtGzObA-j3QJWCgAEiIYwszs= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16x4gRTK2qtXXCbsDck4acozyokVdBXRD81tFTx6RKmsN0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9078,7 +23292,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -9090,7 +23304,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -9100,25 +23314,25 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 3072 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 67072 ++ LdsBytesNoAmax: 62464 + LdsInitCVgprs: false +- LdsNumBytes: 67072 +- LdsNumElementsAlignedA: 50176 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 62464 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 50176 +- LdsOffsetB_Blk: 181248 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 67072 +- LdsOffsetMetadata_Blk: 181248 +- LdsPadA: 16 ++ LdsOffsetMetadata: 62464 ++ LdsOffsetMetadata_Blk: 81920 ++ LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -9138,15 +23352,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 +- MIWaveTileB: 4 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 5] ++ MIWaveTileA: 2 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 64 ++ MacroTile1: 160 ++ MacroTileA: 64 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -9158,7 +23372,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9167,20 +23381,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 12 +- NumLoadsB: 4 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 4 ++ NumLoadsB: 10 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9194,8 +23408,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 41 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 103 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9204,20 +23418,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 16 +- SubGroup1: 16 +- SubGroupA: 16 +- SubGroupB: 16 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 12 +- ThreadTile1: 4 +- ThreadTileA: 12 +- ThreadTileB: 4 ++ ThreadTile0: 8 ++ ThreadTile1: 5 ++ ThreadTileA: 8 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9232,17 +23446,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9263,7 +23477,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9272,24 +23486,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x64_MI32SYwe4t8BJePr3n_W-JYVAmMfA1s5kHMFU4dIGNRGrpg= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16xY-oTb_dkghY7hDfWDC66Jb1IQgF7gseaUy4D61STKRE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9314,36 +23529,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 ++ LSCA: 64 + LSCB: 64 +- LSPA: 8 ++ LSPA: 16 + LSPB: 16 +- LVCA: 32 ++ LVCA: 16 + LVCB: 16 +- LVPA: 2 ++ LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 66560 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 163840 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 99328 +- LdsOffsetMetadata_Blk: 163840 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -9351,11 +23566,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -9367,22 +23582,22 @@ + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 64 ++ MacroTile1: 128 ++ MacroTileA: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9391,20 +23606,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 64 +- NumLoadsA: 8 +- NumLoadsB: 16 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9418,8 +23633,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 42 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 104 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9431,16 +23646,16 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 8 + ThreadTile1: 4 +- ThreadTileA: 32 ++ ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -9463,10 +23678,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9487,7 +23702,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9496,24 +23711,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x64_MI32SbmEZHTGfUuO3_uo0O49M_bggbABPXCXLDk_YkTJuBE= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x12o94AT2dQJF1DN4gY69yYG93mJGImd3C2_axBVK9-Mc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9538,36 +23754,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 ++ LSCA: 64 + LSCB: 64 +- LSPA: 8 ++ LSPA: 16 + LSPB: 16 +- LVCA: 32 ++ LVCA: 16 + LVCB: 16 +- LVPA: 2 ++ LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 84992 ++ LdsBytesNoAmax: 44032 + LdsInitCVgprs: false +- LdsNumBytes: 84992 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 52224 ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 163840 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 84992 +- LdsOffsetMetadata_Blk: 163840 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -9575,11 +23791,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -9591,22 +23807,22 @@ + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 192 +- MacroTileA: 128 +- MacroTileB: 192 ++ MacroTile0: 64 ++ MacroTile1: 96 ++ MacroTileA: 64 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9615,20 +23831,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 12 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 4 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9642,8 +23858,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 43 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 105 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9655,16 +23871,16 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 8 + ThreadTile1: 3 +- ThreadTileA: 32 ++ ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -9687,10 +23903,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9711,7 +23927,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9720,24 +23936,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI32aTcu5KS-MyVVdbP1u8BsdCZAbTBbUvv_SwaYMyBH56c= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x1xmOIgYl-E5QvsZVddKhm52AUgqzJYOPAHf8qqA3NXpk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9762,36 +23979,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 ++ LSCA: 64 + LSCB: 64 +- LSPA: 8 ++ LSPA: 16 + LSPB: 16 +- LVCA: 32 ++ LVCA: 16 + LVCB: 16 +- LVPA: 2 ++ LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 0 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 66560 ++ LdsBytesNoAmax: 33792 + LdsInitCVgprs: false +- LdsNumBytes: 66560 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 163840 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 66560 +- LdsOffsetMetadata_Blk: 163840 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -9799,11 +24016,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -9815,22 +24032,22 @@ + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 64 ++ MacroTile1: 64 ++ MacroTileA: 64 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9839,20 +24056,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9866,8 +24083,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 44 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 106 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9879,16 +24096,16 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 8 + ThreadTile1: 2 +- ThreadTileA: 32 ++ ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -9911,10 +24128,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9933,9 +24150,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9944,24 +24161,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x5wdwgXqUPHNUaDUqxPAyMd-IANb_d8mI1-2B9t5XcWw= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x14HRH_K6vGl6ZIw5JiTY55z831SZha7olN8cCad7lFzY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9974,7 +24192,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -9986,76 +24204,76 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 ++ LSCA: 64 + LSCB: 64 +- LSPA: 8 ++ LSPA: 16 + LSPB: 16 +- LVCA: 32 ++ LVCA: 16 + LVCB: 16 +- LVPA: 2 ++ LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 114944 ++ LdsBytesNoAmax: 25600 + LdsInitCVgprs: false +- LdsNumBytes: 114944 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 25600 ++ LdsNumElementsAlignedA: 16384 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98304 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 16384 ++ LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 +- LdsOffsetMetadata_Blk: 98304 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [1, 2] +- MIWaveTileA: 1 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 64 ++ MacroTile1: 32 ++ MacroTileA: 64 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10063,20 +24281,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 4 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 4 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10090,8 +24308,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 45 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 107 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10100,20 +24318,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 1 ++ ThreadTileA: 8 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10128,17 +24346,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10159,7 +24377,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10168,24 +24386,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x64_MI32xzgTy7aVdFYigBM6ITIhydFLyr2azrXuSlF6fqEfKA2w= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16xS6tQbHyKF5TLBk0BvmhZnyCOUDHyFEYqWcp-9X51eTg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10198,7 +24417,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -10210,36 +24429,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 ++ LSCA: 32 + LSCB: 64 +- LSPA: 16 ++ LSPA: 32 + LSPB: 16 +- LVCA: 16 ++ LVCA: 8 + LVCB: 16 +- LVPA: 4 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 83968 ++ LdsBytesNoAmax: 76800 + LdsInitCVgprs: false +- LdsNumBytes: 83968 +- LdsNumElementsAlignedA: 16384 ++ LdsNumBytes: 76800 ++ LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 147456 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 83968 +- LdsOffsetMetadata_Blk: 147456 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 76800 ++ LdsOffsetMetadata_Blk: 140288 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -10247,38 +24466,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 8] ++ MIWaveTileA: 1 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 64 ++ MacroTile0: 32 + MacroTile1: 256 +- MacroTileA: 64 ++ MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -10287,19 +24506,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 ++ NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 ++ NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -10308,14 +24527,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 46 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 108 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10324,20 +24543,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 8 ++ ThreadTileA: 4 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10352,8 +24571,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -10361,8 +24580,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10381,9 +24600,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10392,31 +24611,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x64_MI32xH6LtNO7ac1G9lL5WPikmoG-nqJIcQMzo_-5W9z9kP8c= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16xwIvselTfPxJYbDySXG0ucikS9Fp60hg-MZwQ7_oJYr0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -10424,7 +24644,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -10434,48 +24654,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 ++ LSCA: 32 + LSCB: 64 +- LSPA: 4 ++ LSPA: 32 + LSPB: 16 +- LVCA: 64 ++ LVCA: 8 + LVCB: 16 +- LVPA: 4 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132608 ++ LdsBytesNoAmax: 73728 + LdsInitCVgprs: false +- LdsNumBytes: 132608 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 49920 ++ LdsNumBytes: 73728 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66304 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 82688 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 82688 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 73728 ++ LdsOffsetMetadata_Blk: 140288 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -10483,27 +24703,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] ++ MIWaveTile: [1, 7] + MIWaveTileA: 1 +- MIWaveTileB: 3 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 32 ++ MacroTile1: 224 ++ MacroTileA: 32 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10511,20 +24731,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 +- NonTemporalMetadata: 0 +- NonTemporalWS: 0 +- NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 16 +- NumLoadsB: 12 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 2 ++ NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10538,30 +24758,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 47 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 109 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 4 ++ ThreadTile1: 7 ++ ThreadTileA: 4 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10583,10 +24803,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10605,9 +24825,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10616,39 +24836,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16xsse8VZ_erbNyu1QYZQC6fny0m2GMXSNpb-Ey2Iy6fu0= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16x6K-E6htE1gXaXG0UNSk7TQW7pTA9zKQjZsyVg_osBXw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -10658,35 +24879,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 ++ LSCA: 32 + LSCB: 64 +- LSPA: 4 +- LSPB: 4 +- LVCA: 64 +- LVCB: 64 +- LVPA: 4 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115712 ++ LdsBytesNoAmax: 61440 + LdsInitCVgprs: false +- LdsNumBytes: 115712 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 61440 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 +- LdsPadA: 0 ++ LdsOffsetMetadata: 61440 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -10707,14 +24928,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 4] +- MIWaveTileA: 2 +- MIWaveTileB: 4 ++ MIWaveTile: [1, 6] ++ MIWaveTileA: 1 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 32 ++ MacroTile1: 192 ++ MacroTileA: 32 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10726,7 +24947,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -10735,20 +24956,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 32 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 2 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 32 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10756,14 +24977,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 48 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 110 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10772,20 +24993,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 8 +- ThreadTile1: 4 +- ThreadTileA: 8 +- ThreadTileB: 4 ++ ThreadTile0: 4 ++ ThreadTile1: 6 ++ ThreadTileA: 4 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10800,8 +25021,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 4 ++ VectorWidthA: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -10809,8 +25030,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10829,9 +25050,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10840,39 +25061,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x15wqw_Br9RibIYSrK27OglB4VJ0OkR9X_qfPPVzFZdm8= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16xiZVjKfU9iTk4F5HkKNoqQpl-GQU5gYTKDCMUSEdPJ1k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -10882,43 +25104,43 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 ++ LSCA: 32 + LSCB: 64 +- LSPA: 4 ++ LSPA: 32 + LSPB: 16 +- LVCA: 64 ++ LVCA: 8 + LVCB: 16 +- LVPA: 4 ++ LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 55296 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 55296 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 +- LdsPadA: 0 ++ LdsOffsetMetadata: 55296 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -10931,14 +25153,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveTile: [1, 5] ++ MIWaveTileA: 1 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 32 ++ MacroTile1: 160 ++ MacroTileA: 32 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10950,29 +25172,29 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 16 +- NumLoadsB: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 2 ++ NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10980,14 +25202,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 49 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 111 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10996,20 +25218,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 8 +- ThreadTile1: 2 +- ThreadTileA: 8 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 5 ++ ThreadTileA: 4 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11024,8 +25246,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -11033,8 +25255,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11053,9 +25275,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11064,31 +25286,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x64_MI32x3MGDu-d_CL-mF31kuIYgZKJ79utYlyYWmFvz4jSmH59Y= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x1p_pBYAIMki-Hs5G4dJt4X9AF2JaGShjgOlcotzr4LY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -11096,7 +25319,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -11106,98 +25329,98 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 +- LSPA: 4 +- LSPB: 8 +- LVCA: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 + LVCB: 16 +- LVPA: 4 +- LVPB: 2 +- LdsBlockSizePerPadA: 0 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 57600 ++ LdsBytesNoAmax: 43008 + LdsInitCVgprs: false +- LdsNumBytes: 57600 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 40960 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 40960 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 4] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 64 ++ MacroTile1: 128 + MacroTileA: 32 +- MacroTileB: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 ++ NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -11210,8 +25433,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 50 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 112 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11223,17 +25446,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false +- ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11249,16 +25472,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11277,9 +25500,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11288,24 +25511,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI32x3pWPkYMaGbQRATU6_6u7skA89mAbDw1SklBn1p9LGxDo= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x1qGr5n7Reb1lKZRzNJW4zA6YbGInITp0W-usgAFeY2aM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11313,7 +25537,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -11330,112 +25554,112 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 ++ LSCA: 32 + LSCB: 64 +- LSPA: 8 +- LSPB: 2 +- LVCA: 16 +- LVCB: 64 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 0 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 57856 ++ LdsBytesNoAmax: 36864 + LdsInitCVgprs: false +- LdsNumBytes: 57856 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 8704 ++ LdsNumBytes: 36864 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 49152 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 49152 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 36864 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 32 ++ MacroTile1: 96 ++ MacroTileA: 32 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 16 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 2 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 51 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 113 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11447,17 +25671,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11479,10 +25703,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11501,9 +25725,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11512,31 +25736,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x64_MI32x3RNPPVWMuOV-Jl9MY8hX0u9Vga7fQG8E9V5mSKM5MaIw= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x14OObrx2Opdznb1yjFz-ts6JFuAgWzgu-jUTBLXc6t-Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: 0 ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -11544,7 +25769,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -11554,75 +25779,75 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 +- LSPA: 8 ++ LSPA: 32 + LSPB: 16 +- LVCA: 32 ++ LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49664 ++ LdsBytesNoAmax: 26624 + LdsInitCVgprs: false +- LdsNumBytes: 49664 +- LdsNumElementsAlignedA: 8192 +- LdsNumElementsAlignedB: 8704 ++ LdsNumBytes: 26624 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8192 +- LdsOffsetB_Blk: 40960 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8192 +- LdsOffsetMetadata_Blk: 40960 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 26624 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 1 +- LoopUnroll: 16 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 2] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 32 ++ MacroTile1: 64 + MacroTileA: 32 +- MacroTileB: 32 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -11631,20 +25856,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 8 +- NumLoadsB: 2 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 2 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -11652,14 +25877,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 0 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 52 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 114 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11671,17 +25896,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11697,16 +25922,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11727,7 +25952,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11736,24 +25961,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x128_MI32YvQ5Rwv2TOhS8wMZ8V5RD5919VoSCRBRSsqZouasEnU= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x1cLuZtt440vv5PLGGgsGsAHw_zuuhKuA2xigZulUz230= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11778,36 +26004,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 128 +- LSPA: 16 +- LSPB: 8 +- LVCA: 16 +- LVCB: 32 +- LVPA: 4 +- LVPB: 2 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 512 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 18432 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 98304 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 18432 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 98304 +- LdsOffsetB_Blk: 360448 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 132096 +- LdsOffsetMetadata_Blk: 360448 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 18432 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -11815,11 +26041,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -11827,26 +26053,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 ++ MIWaveTile: [1, 1] ++ MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -11855,20 +26081,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 24 +- NumLoadsB: 8 +- NumLoadsCoalescedA: 3 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -11876,18 +26102,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 53 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 115 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -11895,16 +26121,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 48 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -11927,16 +26153,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -11951,7 +26177,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11960,24 +26186,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x128_MI3gr3CEAI7UjdlhamZ00cy75gV_AtlHVMpaFvwxBQxL_4= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16xYld0PPLNEU6Mhk5XRKt5eI79Zj1L9mSyTiTCvZn0em0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11990,7 +26217,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -12002,36 +26229,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 256 ++ LSCB: 64 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 79360 + LdsInitCVgprs: false +- LdsNumBytes: 132096 ++ LdsNumBytes: 79360 + LdsNumElementsAlignedA: 65536 +- LdsNumElementsAlignedB: 66560 ++ LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 ++ LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 +- LdsOffsetB_Blk: 327680 ++ LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 132096 +- LdsOffsetMetadata_Blk: 327680 ++ LdsOffsetMetadata: 79360 ++ LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12039,38 +26266,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 256 ++ MacroTile1: 48 ++ MacroTileA: 256 ++ MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12079,20 +26306,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12100,36 +26327,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 54 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 116 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12144,8 +26371,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -12153,14 +26380,14 @@ + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12175,7 +26402,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12184,24 +26411,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x128_MI324NZdguLVSWLmCPOe0rZE8JlPBUypi1v9hD3xJ5PD_D8= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16x9rpswzJMmEtFDRrSNbdzzxDMzIOo9pSY7k05jSlVVAU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12214,7 +26442,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -12226,24 +26454,24 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 256 ++ LSCB: 64 ++ LSPA: 4 ++ LSPB: 16 ++ LVCA: 64 ++ LVCB: 16 ++ LVPA: 1 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 70144 + LdsInitCVgprs: false +- LdsNumBytes: 98816 ++ LdsNumBytes: 70144 + LdsNumElementsAlignedA: 65536 +- LdsNumElementsAlignedB: 33280 ++ LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +@@ -12252,10 +26480,10 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 98816 ++ LdsOffsetMetadata: 70144 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12263,11 +26491,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -12275,26 +26503,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] +- MIWaveTile: [1, 2] +- MIWaveTileA: 1 +- MIWaveTileB: 2 ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 256 ++ MacroTile1: 16 ++ MacroTileA: 256 ++ MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12303,20 +26531,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 32 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 +- NumLoadsB: 8 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12324,36 +26552,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 55 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 117 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 8 +- SubGroup1: 32 +- SubGroupA: 8 +- SubGroupB: 32 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 2 ++ ThreadTile1: 1 + ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12368,23 +26596,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12399,7 +26627,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12408,24 +26636,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x128_MI32azDILX8C9ZkXlKNyXfiaG3pKaljgDMm-RylzQuFt9Qk= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16xOGbO63-K9jKL-XBUZ_gzGClX6vvuWgOgQP0fAQodk-w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12450,36 +26679,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 128 +- LSPA: 16 +- LSPB: 8 +- LVCA: 16 +- LVCB: 32 +- LVPA: 4 +- LVPB: 2 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 512 ++ LSCA: 16 ++ LSCB: 64 ++ LSPA: 64 ++ LSPB: 16 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 16 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 768 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 134144 ++ LdsBytesNoAmax: 80896 + LdsInitCVgprs: false +- LdsNumBytes: 134144 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 101376 ++ LdsNumBytes: 80896 ++ LdsNumElementsAlignedA: 13312 ++ LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 294912 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 13312 ++ LdsOffsetB_Blk: 144384 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 134144 +- LdsOffsetMetadata_Blk: 294912 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 80896 ++ LdsOffsetMetadata_Blk: 144384 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12487,38 +26716,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] +- MIWaveTileA: 1 +- MIWaveTileB: 3 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 48 ++ MacroTile1: 256 ++ MacroTileA: 48 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12527,20 +26756,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 24 +- NumLoadsCoalescedA: 1 ++ NumLoadsA: 3 ++ NumLoadsB: 16 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 24 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12548,18 +26777,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 56 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x192x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 118 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -12567,17 +26796,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12593,22 +26822,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12623,7 +26852,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12632,31 +26861,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x128_MI32DZyKNCFNs59yMd86Bft-OymzPOQHRUdpAcuXL_pf6_A= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16xdgW0ExUJniPQwgDfuX1UQ1XaJWaM9-9sq7ZYI6PMFWo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -12664,7 +26894,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -12674,36 +26904,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 128 +- LSPA: 4 +- LSPB: 8 +- LVCA: 64 +- LVCB: 32 +- LVPA: 4 +- LVPB: 2 +- LdsBlockSizePerPadA: 0 ++ LSCA: 16 ++ LSCB: 64 ++ LSPA: 64 ++ LSPB: 16 ++ LVCA: 4 ++ LVCB: 16 ++ LVPA: 16 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 72704 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 66560 ++ LdsNumBytes: 72704 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 163840 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 136192 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 99328 +- LdsOffsetMetadata_Blk: 163840 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 72704 ++ LdsOffsetMetadata_Blk: 136192 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12711,38 +26941,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [1, 2] ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [1, 4] + MIWaveTileA: 1 +- MIWaveTileB: 2 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 16 ++ MacroTile1: 256 ++ MacroTileA: 16 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12751,19 +26981,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 32 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 32 ++ NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -12778,12 +27008,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 57 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 119 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -12791,17 +27021,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12817,22 +27047,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12845,9 +27075,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12856,24 +27086,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32xAZSP2KQbxdxGtuU1vaeDZGO-V0STi-1Zh233iOAXZgI= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16xHgxNzoeNQudugkDWzzo2dY4VYb2daAPC3c93WaZrMKk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12898,48 +27129,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 ++ LSCA: 32 + LSCB: 128 +- LSPA: 16 ++ LSPA: 32 + LSPB: 8 +- LVCA: 16 ++ LVCA: 8 + LVCB: 32 +- LVPA: 4 ++ LVPA: 8 + LVPB: 2 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 1536 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 68608 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 68608 ++ LdsNumElementsAlignedA: 51200 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66048 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 51200 ++ LdsOffsetB_Blk: 182272 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 +- LdsOffsetMetadata_Blk: 98816 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 68608 ++ LdsOffsetMetadata_Blk: 182272 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 8 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -12947,48 +27178,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 8 +- NumLoadsCoalescedA: 1 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 12 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13002,29 +27233,29 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 58 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 120 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1536_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 12 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -13047,10 +27278,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13069,9 +27300,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13080,39 +27311,40 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x128_MI32xvmwR5sga7joo1eBCFQvviBjaGYjRo93DNrk3xMVKhuM= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16xbWi-foX_V7f0jEdZsr7UCWnz1fVO258UxcK9HVssUXE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -13122,112 +27354,112 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 32 ++ LSCA: 64 + LSCB: 128 +- LSPA: 4 +- LSPB: 4 +- LVCA: 32 ++ LSPA: 16 ++ LSPB: 8 ++ LVCA: 16 + LVCB: 32 + LVPA: 4 +- LVPB: 1 +- LdsBlockSizePerPadA: 0 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 66560 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 66560 ++ LdsNumElementsAlignedA: 32768 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 32768 ++ LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 ++ LdsOffsetMetadata: 66560 ++ LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +- LocalSplitUReuseLDS: 1 +- LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 8 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 2] ++ MIWaveTileA: 2 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 32 ++ MacroTile0: 64 + MacroTile1: 64 +- MacroTileA: 32 ++ MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 32 +- NumLoadsB: 16 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 8 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 32 +- NumLoadsPerpendicularB: 16 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 59 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 121 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13236,20 +27468,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 8 ++ ThreadTile1: 2 ++ ThreadTileA: 8 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13264,17 +27496,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13293,9 +27525,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13304,24 +27536,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI32xVcGFEopOp1e7OtgJ5QtOtdxmRyNV6l2lGyiVJsTeA34= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x5KPOZZrg6Cnp8M_ooKYcM1XpAqX7xJnpWE92Q2bgNt0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13334,7 +27567,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -13346,24 +27579,24 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 128 +- LSPA: 8 +- LSPB: 4 ++ LSPA: 16 ++ LSPB: 8 + LVCA: 16 + LVCB: 32 +- LVPA: 2 +- LVPB: 1 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LVPA: 4 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 114944 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 114944 ++ LdsNumBytes: 50176 + LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 16640 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +@@ -13372,31 +27605,31 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 +- LdsPadB: 4 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 8 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 +@@ -13408,36 +27641,36 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -13450,8 +27683,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 60 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 122 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13460,19 +27693,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 8 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -13488,17 +27721,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13517,9 +27750,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13528,24 +27761,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x128_MI32xV19bIgwk3wd_tDgkYb6hRPLF-drsrFLXk3mHfVkvrBE= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16xbxz6E9TVCU8Jn6XriFFEvJP8y5q129aqLp6QvQe3Lfw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13570,7 +27804,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 +@@ -13580,87 +27814,87 @@ + LVCB: 32 + LVPA: 8 + LVPB: 2 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98560 ++ LdsBytesNoAmax: 70656 + LdsInitCVgprs: false +- LdsNumBytes: 98560 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 70656 ++ LdsNumElementsAlignedA: 18432 ++ LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 18432 ++ LdsOffsetB_Blk: 149504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 70656 ++ LdsOffsetMetadata_Blk: 149504 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 +- LoopUnroll: 32 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 32 ++ MacroTile1: 96 + MacroTileA: 32 +- MacroTileB: 32 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 4 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 +- NumLoadsB: 4 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13674,8 +27908,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 61 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 123 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13687,17 +27921,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13719,10 +27953,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13743,7 +27977,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13752,24 +27986,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x256_MI32xF57LGs_gt47r-HvVpHUT_N0_nbAu20NDPDNd9y14Jgo= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16xJ2-JY58V21cEsWaRNDdPiIe_SHUYTpZKbrAA0RwhMDc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 256 +- DirectToLds: 0 ++ DepthU: 128 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13794,36 +28029,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 +- LSCB: 256 +- LSPA: 16 +- LSPB: 4 +- LVCA: 16 +- LVCB: 64 +- LVPA: 4 +- LVPB: 1 +- LdsBlockSizePerPadA: 0 ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 8 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 8 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 52224 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 65536 +- LdsNumElementsAlignedB: 66560 ++ LdsNumBytes: 52224 ++ LdsNumElementsAlignedA: 18432 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 65536 +- LdsOffsetB_Blk: 327680 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 18432 ++ LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 132096 +- LdsOffsetMetadata_Blk: 327680 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 52224 ++ LdsOffsetMetadata_Blk: 83968 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -13831,11 +28066,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 16 +- LoopUnroll: 256 ++ LoopIters: 4 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -13843,26 +28078,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] ++ MIWaveTile: [1, 2] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 ++ MacroTile0: 32 + MacroTile1: 64 +- MacroTileA: 64 ++ MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -13871,20 +28106,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 16 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13898,12 +28133,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 62 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 124 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 1024 ++ StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -13911,17 +28146,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13937,22 +28172,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 256 +- _DepthUA: 256 +- _DepthUB: 256 +- _DepthUMetadata: 256 ++ _DepthU: 128 ++ _DepthUA: 128 ++ _DepthUB: 128 ++ _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -13965,9 +28200,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13976,24 +28211,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x256_MI32xWZjnlyXJlmyiWibdW-ivPDDWsCJuZQQPcZ4aOPI0WFo= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x7sqLgjTk8vazvlEmFt0KARjNh7TN1lEs_ZqtI-YFdfg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 128 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14018,55 +28254,55 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 256 ++ LSCB: 128 + LSPA: 32 +- LSPB: 4 ++ LSPB: 8 + LVCA: 8 +- LVCB: 64 ++ LVCB: 32 + LVPA: 8 +- LVPB: 1 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 1024 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 35840 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 32768 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 35840 ++ LdsNumElementsAlignedA: 18432 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66048 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 18432 ++ LdsOffsetB_Blk: 83968 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 +- LdsOffsetMetadata_Blk: 98816 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 83968 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 4 +- LoopUnroll: 64 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] ++ MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 +@@ -14080,35 +28316,35 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -14122,29 +28358,29 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 63 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 125 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 1024 ++ StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -14167,16 +28403,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 256 +- _DepthUA: 256 +- _DepthUB: 256 +- _DepthUMetadata: 256 ++ _DepthU: 128 ++ _DepthUA: 128 ++ _DepthUB: 128 ++ _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -14191,7 +28427,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14200,24 +28436,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x512_MI32xeKwW1xBcYm6xNAmBD4QjJXt_2Tae3WG5BZzP0VJwaF0= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x0LH-gMLeSdAzhVnI8tuJpF_fAKNA79FFRUo0dpuDP_o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 512 +- DirectToLds: 0 ++ DepthU: 256 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14242,55 +28479,55 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x512_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB2048_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 512 ++ LSCB: 256 + LSPA: 32 +- LSPB: 2 ++ LSPB: 4 + LVCA: 8 +- LVCB: 128 ++ LVCB: 64 + LVPA: 8 + LVPB: 1 +- LdsBlockSizePerPadA: 0 +- LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 131584 ++ LdsBytesNoAmax: 70656 + LdsInitCVgprs: false +- LdsNumBytes: 131584 +- LdsNumElementsAlignedA: 65536 +- LdsNumElementsAlignedB: 66048 ++ LdsNumBytes: 70656 ++ LdsNumElementsAlignedA: 36864 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 65536 +- LdsOffsetB_Blk: 327680 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 36864 ++ LdsOffsetB_Blk: 167936 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 131584 +- LdsOffsetMetadata_Blk: 327680 +- LdsPadA: 0 +- LdsPadB: 4 ++ LdsOffsetMetadata: 70656 ++ LdsOffsetMetadata_Blk: 167936 ++ LdsPadA: 16 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 +- LoopUnroll: 128 ++ LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] ++ MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 +@@ -14304,35 +28541,35 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -14340,18 +28577,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 64 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x32x512_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA0_LBSPPB2048_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 126 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 2048 ++ StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -14359,16 +28596,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -14391,16 +28628,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 512 +- _DepthUA: 512 +- _DepthUB: 512 +- _DepthUMetadata: 512 ++ _DepthU: 256 ++ _DepthUA: 256 ++ _DepthUB: 256 ++ _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -14415,7 +28652,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14424,24 +28661,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x128_MI16xnJs-YU_-y5lWcM5l6zck2hREVt-gPkv_f9YWcD-c8CA= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16xRQi3LEuMgTdjWwm0N7An5YAQxWyvqUe1oftcjX9GgUA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14466,7 +28704,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 128 +@@ -14534,7 +28772,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -14542,9 +28780,9 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -14565,13 +28803,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 65 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 127 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -14583,7 +28821,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -14617,8 +28855,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -14637,9 +28875,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14648,24 +28886,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16xa0A8net-1FZWvJR-NmOx6vu_dWT65KE1aOlSXyu-w5U= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16xsw4RA8-UkhZuliRGe9B3KnqH9kxJ-7zNAu5riw7_G_A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14678,7 +28917,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -14690,7 +28929,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 128 +@@ -14700,33 +28939,33 @@ + LVCB: 32 + LVPA: 8 + LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 57600 ++ LdsBytesNoAmax: 27136 + LdsInitCVgprs: false +- LdsNumBytes: 57600 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 8448 ++ LdsNumBytes: 27136 ++ LdsNumElementsAlignedA: 18432 ++ LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 49152 ++ LdsOffsetB: 18432 ++ LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 49152 +- LdsPadA: 0 ++ LdsOffsetMetadata: 27136 ++ LdsOffsetMetadata_Blk: 51200 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -14758,23 +28997,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 +@@ -14788,14 +29027,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 66 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 128 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -14804,10 +29043,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -14832,7 +29071,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -14841,8 +29080,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -14861,9 +29100,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14872,7 +29111,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x128_MI16xuzFl02--pbthXGVE5Xar7AmMRXhNAlyXbSeJETmoWXA= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16xoLHWlRfr-LTztstNbLl_ixCQPyeeRLY2uCwnGkakJ8I= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -14883,13 +29122,14 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14902,7 +29142,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -14914,7 +29154,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 64 + LSCB: 128 +@@ -14927,22 +29167,22 @@ + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 43520 + LdsInitCVgprs: false +- LdsNumBytes: 107008 +- LdsNumElementsAlignedA: 32768 ++ LdsNumBytes: 43520 ++ LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98304 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 32768 +- LdsOffsetMetadata_Blk: 98304 +- LdsPadA: 0 ++ LdsOffsetMetadata: 43520 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -14982,23 +29222,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 +@@ -15018,8 +29258,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 67 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 129 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15028,10 +29268,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15056,7 +29296,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -15065,8 +29305,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15087,7 +29327,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15096,24 +29336,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x9oldsal-Rk8c1cZlvLcQFfiDFAqXc5uekgnCwmFop8U= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16xgsRVgm9pHBHs85lpCJoTta_SuXVoTQUPOpxzlpicPn8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15138,7 +29379,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 128 +@@ -15149,13 +29390,13 @@ + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 27136 ++ LdsBytesNoAmax: 27648 + LdsInitCVgprs: false +- LdsNumBytes: 27136 ++ LdsNumBytes: 27648 + LdsNumElementsAlignedA: 10240 +- LdsNumElementsAlignedB: 16896 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +@@ -15164,7 +29405,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 27136 ++ LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 43008 + LdsPadA: 16 + LdsPadB: 8 +@@ -15206,7 +29447,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -15214,9 +29455,9 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -15237,13 +29478,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 68 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 130 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15255,7 +29496,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15281,7 +29522,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -15289,8 +29530,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15309,9 +29550,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15320,24 +29561,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x128_MI16xIruV_jXuMwzri4A0SJx8UxL3FvzPZSu3BnRvmzDAW58= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16x31zu3VqF-oY9wUfJJUo6i6k4GD2brDpwx53Wqmn6lSM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15362,7 +29604,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 128 +@@ -15373,13 +29615,13 @@ + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 109056 ++ LdsBytesNoAmax: 45056 + LdsInitCVgprs: false +- LdsNumBytes: 109056 ++ LdsNumBytes: 45056 + LdsNumElementsAlignedA: 10240 +- LdsNumElementsAlignedB: 33280 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +@@ -15388,7 +29630,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 10240 ++ LdsOffsetMetadata: 45056 + LdsOffsetMetadata_Blk: 75776 + LdsPadA: 16 + LdsPadB: 8 +@@ -15430,17 +29672,17 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -15461,13 +29703,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 69 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 131 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15479,7 +29721,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15505,7 +29747,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -15513,8 +29755,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15533,9 +29775,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15544,31 +29786,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16xC_lYSbjx7D7V5IoRAKdVYr-rRroJUV87NZgSpTG6haU= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16xRvXdQ6Uovs1XTsiZgCGu64vVWUM7lcQQQAXTiFbA8OE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -15576,7 +29819,7 @@ + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false +- GuaranteeNoPartialA: true ++ GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] +@@ -15586,43 +29829,43 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 256 +- LSPA: 16 ++ LSPA: 64 + LSPB: 4 +- LVCA: 16 ++ LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 37376 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 16384 ++ LdsNumBytes: 37376 ++ LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetB: 20480 ++ LdsOffsetB_Blk: 86016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 +- LdsPadA: 0 ++ LdsOffsetMetadata: 37376 ++ LdsOffsetMetadata_Blk: 86016 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -15654,28 +29897,28 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 +- NumLoadsA: 16 ++ NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 ++ NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -15690,8 +29933,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 70 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 132 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15703,7 +29946,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15737,8 +29980,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -15759,7 +30002,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15768,24 +30011,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16xIWaCm2yrKCpXSn4Sv5BRojUhr6T-U4GPLZONIoU8p-Y= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16xwpGPpeGbHV8QYIxo2_Gqz4l8DT6DcL9WomNgsRY_DiI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15798,7 +30042,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -15810,7 +30054,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 256 +@@ -15823,22 +30067,22 @@ + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49664 ++ LdsBytesNoAmax: 53760 + LdsInitCVgprs: false +- LdsNumBytes: 49664 +- LdsNumElementsAlignedA: 32768 ++ LdsNumBytes: 53760 ++ LdsNumElementsAlignedA: 36864 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 32768 +- LdsOffsetB_Blk: 98304 ++ LdsOffsetB: 36864 ++ LdsOffsetB_Blk: 102400 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 49664 +- LdsOffsetMetadata_Blk: 98304 +- LdsPadA: 0 ++ LdsOffsetMetadata: 53760 ++ LdsOffsetMetadata_Blk: 102400 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -15878,23 +30122,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 +@@ -15914,8 +30158,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 71 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 133 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15924,10 +30168,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15952,7 +30196,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -15961,8 +30205,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -15983,7 +30227,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15992,24 +30236,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x256_MI16xJa2LFu2TIVZSd8zZLKFpZwjJR8veXK46OjVO3GpjDdc= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16xv6hIU0Y_Ka5SRP7TC4DHIUvSEtZK_iPosYeKmy75WsM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16022,7 +30267,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -16034,7 +30279,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 64 + LSCB: 256 +@@ -16047,22 +30292,22 @@ + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82432 ++ LdsBytesNoAmax: 86528 + LdsInitCVgprs: false +- LdsNumBytes: 82432 +- LdsNumElementsAlignedA: 65536 ++ LdsNumBytes: 86528 ++ LdsNumElementsAlignedA: 69632 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 65536 +- LdsOffsetB_Blk: 196608 ++ LdsOffsetB: 69632 ++ LdsOffsetB_Blk: 200704 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82432 +- LdsOffsetMetadata_Blk: 196608 +- LdsPadA: 0 ++ LdsOffsetMetadata: 86528 ++ LdsOffsetMetadata_Blk: 200704 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -16102,23 +30347,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 +@@ -16138,8 +30383,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 72 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 134 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16148,10 +30393,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16176,7 +30421,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -16185,8 +30430,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16205,9 +30450,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16216,24 +30461,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x256_MI16xeqk-j6zURSmOzkrbUy-PG9UoyCxO7gYq2gp_V8IOggA= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16xlEEpXcnDq5dGUXGn2-hH4G_IHfcqacFpVUjaCxu689w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16258,7 +30504,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 256 +@@ -16268,33 +30514,33 @@ + LVCB: 64 + LVPA: 16 + LVPB: 1 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 54272 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 16384 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 54272 ++ LdsNumElementsAlignedA: 20480 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16384 +- LdsOffsetB_Blk: 81920 ++ LdsOffsetB: 20480 ++ LdsOffsetB_Blk: 86016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16384 +- LdsOffsetMetadata_Blk: 81920 +- LdsPadA: 0 ++ LdsOffsetMetadata: 54272 ++ LdsOffsetMetadata_Blk: 86016 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -16326,17 +30572,17 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -16362,8 +30608,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 73 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 135 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16375,7 +30621,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16401,7 +30647,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -16409,8 +30655,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16431,7 +30677,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16440,24 +30686,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x256_MI16xEqh7mf95aagbBjooaIwdoK95ICxnd1TIovb4zMqN9_I= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16x6qs8xtGt9vfFjz0MkJkj_0DMSSF_5SYrpXwxVjs3EZI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16482,7 +30729,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 256 +@@ -16493,13 +30740,13 @@ + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 86528 ++ LdsBytesNoAmax: 88064 + LdsInitCVgprs: false +- LdsNumBytes: 86528 ++ LdsNumBytes: 88064 + LdsNumElementsAlignedA: 20480 +- LdsNumElementsAlignedB: 66048 ++ LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +@@ -16508,7 +30755,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 86528 ++ LdsOffsetMetadata: 88064 + LdsOffsetMetadata_Blk: 151552 + LdsPadA: 16 + LdsPadB: 8 +@@ -16550,7 +30797,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -16558,9 +30805,9 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -16586,8 +30833,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 74 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 136 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16599,7 +30846,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16625,7 +30872,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -16633,8 +30880,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16655,7 +30902,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16664,24 +30911,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x512_MI16xmKgZ_gJEy_tBWeVDIoR5-yMYyT8Q6FHLwq5b1QzuSR0= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16xnco8ffhtAb2MQHxpQ3MBfgvf6o6ZQ2C7Kd12hhHcKPM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16706,7 +30954,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 +@@ -16774,7 +31022,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -16782,9 +31030,9 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -16810,8 +31058,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 75 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 137 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16823,7 +31071,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16857,8 +31105,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +@@ -16879,7 +31127,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16888,24 +31136,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x512_MI16xHN0Ly63TVXAO6oISTU2Gf6zzkmCTHorZDGjLnSEMI2A= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16xf_hX1xlX3shzZyGPL8EDIa_JlE1TZbF1RdfwtrsFMP8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16918,7 +31167,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true +@@ -16930,7 +31179,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 32 + LSCB: 512 +@@ -16943,22 +31192,22 @@ + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 107008 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 65536 ++ LdsNumBytes: 107008 ++ LdsNumElementsAlignedA: 73728 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 65536 +- LdsOffsetB_Blk: 196608 ++ LdsOffsetB: 73728 ++ LdsOffsetB_Blk: 204800 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 98816 +- LdsOffsetMetadata_Blk: 196608 +- LdsPadA: 0 ++ LdsOffsetMetadata: 107008 ++ LdsOffsetMetadata_Blk: 204800 ++ LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -16998,7 +31247,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -17007,14 +31256,14 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 +@@ -17028,14 +31277,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 76 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 138 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -17044,10 +31293,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -17072,7 +31321,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -17081,8 +31330,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +@@ -17103,7 +31352,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -17112,24 +31361,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x512_MI16xZkZhi1hlOrT8FEaPM8C1jp49SxatvkUuj0o3AgVnsVk= ++ BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16xaQ1MiH0ARu6zBa8zX6ASS3d-qmfx60Fx3z21_NAVnHk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -17154,7 +31404,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 16 + LSCB: 512 +@@ -17165,13 +31415,13 @@ + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 107520 + LdsInitCVgprs: false +- LdsNumBytes: 107008 ++ LdsNumBytes: 107520 + LdsNumElementsAlignedA: 40960 +- LdsNumElementsAlignedB: 66048 ++ LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +@@ -17180,7 +31430,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 107008 ++ LdsOffsetMetadata: 107520 + LdsOffsetMetadata_Blk: 172032 + LdsPadA: 16 + LdsPadB: 8 +@@ -17222,7 +31472,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -17230,13 +31480,13 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 +@@ -17258,8 +31508,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 77 +- SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 139 ++ SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -17271,7 +31521,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -17297,7 +31547,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -17305,8 +31555,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml +index f19e330f81..059f3234e5 100644 +--- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml ++++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml +@@ -78,9 +78,9 @@ + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +-- - 1LDSBuffer: 0 ++- - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -89,31 +89,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x256x16_MI32y00xJ2zVcuKwBRZCbdO-qzQAn1-y0P6pIZO5VbWRwGU= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x256x32_MI16_WuRAwV0VWJJ5fLiyhqrcTU6iLjV1QG9-5tt9oz7Eb8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -131,35 +132,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 ++ LSCA: 32 + LSCB: 256 +- LSPA: 16 ++ LSPA: 32 + LSPB: 4 +- LVCA: 16 ++ LVCA: 8 + LVCB: 64 +- LVPA: 16 ++ LVPA: 8 + LVPB: 1 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 67584 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 67584 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 82944 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 82944 +- LdsPadA: 4 ++ LdsOffsetMetadata: 67584 ++ LdsOffsetMetadata_Blk: 165888 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -169,10 +170,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -180,9 +181,9 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 +- MIWaveTileB: 4 ++ MIWaveTile: [8, 8] ++ MIWaveTileA: 8 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 +@@ -193,13 +194,13 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -208,20 +209,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 +- NumLoadsA: 16 +- NumLoadsB: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -229,18 +230,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -248,17 +249,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 4 +- ThreadTileA: 64 +- ThreadTileB: 4 ++ ThreadTile0: 32 ++ ThreadTile1: 8 ++ ThreadTileA: 32 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -280,16 +281,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -304,7 +305,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -313,7 +314,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x192x16_MI32QoEB3pjvKQfy62nlwNUPJa2yUOfAG2jQDsczyYc-Xp8= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16m7o_0OPcYitSd3R8-21zd8FUb8gj8U_dWBwrgKCocls= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -323,22 +324,23 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -346,7 +348,7 @@ + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -355,48 +357,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 16 +- LSPB: 4 +- LVCA: 16 +- LVCB: 64 +- LVPA: 16 +- LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3584 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 62464 ++ LdsBytesNoAmax: 129536 + LdsInitCVgprs: false +- LdsNumBytes: 62464 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 12288 ++ LdsNumBytes: 129536 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 29184 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 50176 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 50176 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -404,27 +406,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 3] +- MIWaveTileA: 4 +- MIWaveTileB: 3 ++ MIWaveTile: [8, 7] ++ MIWaveTileA: 8 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 192 ++ MacroTile1: 224 + MacroTileA: 256 +- MacroTileB: 192 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -432,20 +434,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 16 +- NumLoadsB: 12 ++ NumElementsPerThread: 224 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 8 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -453,18 +455,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -472,17 +474,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 3 +- ThreadTileA: 64 +- ThreadTileB: 3 ++ ThreadTile0: 32 ++ ThreadTile1: 7 ++ ThreadTileA: 32 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -504,16 +506,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -526,9 +528,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -537,24 +539,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x16_MI32yUyrrRfoTakEzBpCMDAbc7MP7O_myO2Ke1JSiZj28W8= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16nwK53b_e7iGYhfH5Cnre7fhTH1qjv2BR9mDMbnqfObY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -579,35 +582,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 128 +- LSPA: 64 +- LSPB: 8 +- LVCA: 4 +- LVCB: 32 +- LVPA: 16 +- LVPB: 2 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 25600 ++ LdsBytesNoAmax: 124928 + LdsInitCVgprs: false +- LdsNumBytes: 25600 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 124928 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 50176 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 25600 +- LdsOffsetMetadata_Blk: 50176 +- LdsPadA: 4 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -617,10 +620,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -628,26 +631,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [8, 6] ++ MIWaveTileA: 8 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 128 ++ MacroTile1: 192 + MacroTileA: 256 +- MacroTileB: 128 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -656,19 +659,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 2 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 192 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 8 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -678,17 +681,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -696,17 +699,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 32 ++ ThreadTile1: 6 ++ ThreadTileA: 32 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -728,16 +731,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -750,9 +753,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -761,24 +764,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x16_MI32x5QxMv_Ca-ga9P-3YimfJGYZF7h9txpWPAWMB8Lv6pEM= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16EXf_4Nj3qYS3o0Pq935qOODLBD0XOWLEoZiRy41Q-9Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -803,36 +807,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 64 +- LSPB: 16 +- LVCA: 4 +- LVCB: 16 +- LVPA: 16 +- LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 21504 ++ LdsBytesNoAmax: 121344 + LdsInitCVgprs: false +- LdsNumBytes: 21504 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 4096 ++ LdsNumBytes: 121344 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 20992 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 50176 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 21504 +- LdsOffsetMetadata_Blk: 50176 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -841,10 +845,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -852,26 +856,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 1] +- MIWaveTileA: 4 +- MIWaveTileB: 1 ++ MIWaveTile: [8, 5] ++ MIWaveTileA: 8 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 64 ++ MacroTile1: 160 + MacroTileA: 256 +- MacroTileB: 64 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -880,19 +884,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 +- NumLoadsB: 1 ++ NumElementsPerThread: 160 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 8 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -902,17 +906,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -920,17 +924,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 1 +- ThreadTileA: 64 +- ThreadTileB: 1 ++ ThreadTile0: 32 ++ ThreadTile1: 5 ++ ThreadTileA: 32 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -952,16 +956,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -974,9 +978,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -985,24 +989,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x256x16_MI327_iCjGxW6unUrRmHUyGu8sFzh-ek2ep81J8xkozrs90= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16InAh0azZva8bg-4m_8YMF_lD5_cUaR8pO3YY9knPnm8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1015,7 +1020,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -1027,48 +1032,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 256 +- LSPA: 64 +- LSPB: 4 +- LVCA: 4 +- LVCB: 64 +- LVPA: 16 +- LVPB: 1 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 8 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 8 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 61632 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 61632 +- LdsNumElementsAlignedA: 12480 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12480 +- LdsOffsetB_Blk: 45248 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12480 +- LdsOffsetMetadata_Blk: 45248 +- LdsPadA: 4 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1076,27 +1081,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 ++ MIWaveTile: [8, 4] ++ MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 256 +- MacroTileA: 192 +- MacroTileB: 256 ++ MacroTile0: 256 ++ MacroTile1: 128 ++ MacroTileA: 256 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1104,19 +1109,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 192 +- NumLoadsA: 3 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -1125,35 +1130,35 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 ++ ThreadTile0: 32 + ThreadTile1: 4 +- ThreadTileA: 48 ++ ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -1169,23 +1174,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1198,9 +1203,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1209,40 +1214,41 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x16_MI32MA7Y4oydr_vS6P8oyV48F89b342bjP0hARQGbv_yGTk= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16xj1XsK521v3lmSc51W30bhnKjZmMx4Sb1ihNKRpRM2jo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -1251,48 +1257,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 16 +- LSPB: 4 +- LVCA: 16 +- LVCB: 64 +- LVPA: 16 +- LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58112 ++ LdsBytesNoAmax: 47616 + LdsInitCVgprs: false +- LdsNumBytes: 58112 +- LdsNumElementsAlignedA: 13056 +- LdsNumElementsAlignedB: 12288 ++ LdsNumBytes: 47616 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 13056 +- LdsOffsetB_Blk: 45824 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 13056 +- LdsOffsetMetadata_Blk: 45824 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 47616 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1300,27 +1306,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 ++ MIWaveTile: [8, 3] ++ MIWaveTileA: 8 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 192 +- MacroTileA: 192 +- MacroTileB: 192 ++ MacroTile0: 256 ++ MacroTile1: 96 ++ MacroTileA: 256 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1328,20 +1334,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 12 +- NumLoadsB: 12 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 8 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1350,34 +1356,34 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 ++ ThreadTile0: 32 + ThreadTile1: 3 +- ThreadTileA: 48 ++ ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -1393,23 +1399,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1422,9 +1428,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1433,7 +1439,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x16_MI32oPRzfRwTUfAnmxYbi0PLTYObWGG9Kc9wnXqrPdjpk9c= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16xgESeeOPMQjPj-8e432BUgOgr4-me0OSa7dztMmQ4QNs= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -1443,14 +1449,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1458,15 +1465,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -1475,48 +1482,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 128 +- LSPA: 64 +- LSPB: 2 +- LVCA: 4 +- LVCB: 128 +- LVPA: 16 +- LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53440 ++ LdsBytesNoAmax: 43008 + LdsInitCVgprs: false +- LdsNumBytes: 53440 +- LdsNumElementsAlignedA: 12480 ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12480 +- LdsOffsetB_Blk: 45248 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12480 +- LdsOffsetMetadata_Blk: 45248 +- LdsPadA: 4 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1524,27 +1531,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 ++ MIWaveTile: [8, 2] ++ MIWaveTileA: 8 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 256 ++ MacroTile1: 64 ++ MacroTileA: 256 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1552,20 +1559,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 3 +- NumLoadsB: 8 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 3 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1580,28 +1587,28 @@ + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 ++ ThreadTile0: 32 + ThreadTile1: 2 +- ThreadTileA: 48 ++ ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -1617,23 +1624,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1646,9 +1653,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1657,24 +1664,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x16_MI32xH4WByFseBcc74FOIM0EkttBykYbw7YTEfN8bu4VhU7I= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16x6fb_yNh7J814AhB-IoeqDgQM79vLS9rpyD8P2pJJ0-I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1687,7 +1695,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -1699,48 +1707,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 64 +- LSPB: 16 +- LVCA: 4 +- LVCB: 16 +- LVPA: 16 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49344 ++ LdsBytesNoAmax: 39424 + LdsInitCVgprs: false +- LdsNumBytes: 49344 +- LdsNumElementsAlignedA: 12480 +- LdsNumElementsAlignedB: 4096 ++ LdsNumBytes: 39424 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12480 +- LdsOffsetB_Blk: 45248 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12480 +- LdsOffsetMetadata_Blk: 45248 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 39424 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1748,27 +1756,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 ++ MIWaveTile: [8, 1] ++ MIWaveTileA: 8 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 256 ++ MacroTile1: 32 ++ MacroTileA: 256 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1776,19 +1784,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 3 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -1798,34 +1806,34 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 ++ ThreadTile0: 32 + ThreadTile1: 1 +- ThreadTileA: 48 ++ ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -1841,23 +1849,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1872,7 +1880,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1881,24 +1889,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x16_MI32I6JcOrR2cQzFez_1O2F9-Hg-eIauaWZlRAdFPmQxzOw= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16zHVr2NWwL_sxXv-0iW7eDX_kRWK9r7_RUz0s-hd7FFM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1911,7 +1920,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -1923,35 +1932,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 ++ LSCA: 32 + LSCB: 256 +- LSPA: 64 ++ LSPA: 32 + LSPB: 4 +- LVCA: 4 ++ LVCA: 8 + LVCB: 64 +- LVPA: 16 ++ LVPA: 8 + LVPB: 1 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 25088 ++ LdsBytesNoAmax: 68608 + LdsInitCVgprs: false +- LdsNumBytes: 25088 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 68608 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 41472 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 25088 +- LdsOffsetMetadata_Blk: 41472 +- LdsPadA: 4 ++ LdsOffsetMetadata: 68608 ++ LdsOffsetMetadata_Blk: 166912 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -1961,37 +1970,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 8] ++ MIWaveTileA: 7 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 128 ++ MacroTile0: 224 + MacroTile1: 256 +- MacroTileA: 128 ++ MacroTileA: 224 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2000,20 +2009,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 2 +- NumLoadsB: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 224 ++ NumGlobalWriteVectorsPerThread: 224 ++ NumLoadsA: 7 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2022,35 +2031,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 8 ++ ThreadTileA: 28 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2065,8 +2074,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -2074,14 +2083,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2096,7 +2105,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2105,7 +2114,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x16_MI32MVwy9R05dYWjM_xovzgl3qvq9-3xO7sZ5oLlbTrKHKQ= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16oY4PKlnLzVxxlHSSpNtvh2-1v40k9J5UpyqnyzHIjWo= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -2115,27 +2124,28 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -2147,48 +2157,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 16 +- LSPB: 16 +- LVCA: 16 +- LVCB: 16 +- LVPA: 16 +- LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 3584 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53760 ++ LdsBytesNoAmax: 130560 + LdsInitCVgprs: false +- LdsNumBytes: 53760 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 12288 ++ LdsNumBytes: 130560 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 29184 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 41472 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 +- LdsOffsetMetadata_Blk: 41472 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2196,27 +2206,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 3] +- MIWaveTileA: 2 +- MIWaveTileB: 3 ++ MIWaveTile: [7, 7] ++ MIWaveTileA: 7 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 192 +- MacroTileA: 128 +- MacroTileB: 192 ++ MacroTile0: 224 ++ MacroTile1: 224 ++ MacroTileA: 224 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -2224,19 +2234,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 3 ++ NumElementsPerThread: 196 ++ NumGlobalWriteVectorsPerThread: 196 ++ NumLoadsA: 7 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 8 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -2252,29 +2262,29 @@ + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 3 +- ThreadTileA: 32 +- ThreadTileB: 3 ++ ThreadTile0: 28 ++ ThreadTile1: 7 ++ ThreadTileA: 28 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2289,23 +2299,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2320,7 +2330,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2329,40 +2339,41 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x16_MI32Anhca7WPTeiLzKfj71LtLVyUxJWZf6K5svF4hyjVq7E= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16tVuI7koiBydJXLPSJDLEAQAW_b2Kw5wOb22zde2Q8sU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -2371,35 +2382,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 128 +- LSPA: 16 +- LSPB: 2 +- LVCA: 16 +- LVCB: 128 +- LVPA: 16 +- LVPB: 2 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 50176 ++ LdsBytesNoAmax: 125952 + LdsInitCVgprs: false +- LdsNumBytes: 50176 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 125952 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 41984 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 9216 +- LdsOffsetMetadata_Blk: 41984 +- LdsPadA: 4 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -2409,10 +2420,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2420,26 +2431,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveTile: [7, 6] ++ MIWaveTileA: 7 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 224 ++ MacroTile1: 192 ++ MacroTileA: 224 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2448,20 +2459,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerThread: 168 ++ NumGlobalWriteVectorsPerThread: 168 ++ NumLoadsA: 7 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2470,35 +2481,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 6 ++ ThreadTileA: 28 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2513,23 +2524,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2544,7 +2555,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2553,7 +2564,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x16_MI32xlvvFwOssU0lvu1RZ70cjZmvj6jK5lboItHpVcwW8wzI= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16vs-zDqEHE514PBL7feVFubih_VJ-QJjarKvtYpslTDg= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -2563,30 +2574,31 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -2595,36 +2607,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 16 +- LSPB: 4 +- LVCA: 16 +- LVCB: 64 +- LVPA: 16 +- LVPB: 4 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 + LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 29696 ++ LdsBytesNoAmax: 122368 + LdsInitCVgprs: false +- LdsNumBytes: 29696 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 4096 ++ LdsNumBytes: 122368 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 20992 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 25600 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 9216 +- LdsOffsetMetadata_Blk: 25600 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -2633,10 +2645,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2644,26 +2656,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 1] +- MIWaveTileA: 2 +- MIWaveTileB: 1 ++ MIWaveTile: [7, 5] ++ MIWaveTileA: 7 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 224 ++ MacroTile1: 160 ++ MacroTileA: 224 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2672,20 +2684,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 4 ++ NumElementsPerThread: 140 ++ NumGlobalWriteVectorsPerThread: 140 ++ NumLoadsA: 7 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 4 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2693,36 +2705,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 1 +- ThreadTileA: 32 +- ThreadTileB: 1 ++ ThreadTile0: 28 ++ ThreadTile1: 5 ++ ThreadTileA: 28 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2737,23 +2749,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2768,7 +2780,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2777,37 +2789,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x16_MI32xoauA4iubGAT8tV02DRVZFYHP3uoXfC3SO20MdrPt2Yw= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16gGpbMwh1OSFqrMrhySm1RwtFGS1qmOjeva7wQ9xZ_L0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -2819,35 +2832,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 256 +- LSPA: 16 +- LSPB: 4 +- LVCA: 16 +- LVCB: 64 +- LVPA: 16 +- LVPB: 1 ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 8 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 8 ++ LVPB: 2 + LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 20992 ++ LdsBytesNoAmax: 52224 + LdsInitCVgprs: false +- LdsNumBytes: 20992 +- LdsNumElementsAlignedA: 4608 ++ LdsNumBytes: 52224 ++ LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 37376 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 20992 +- LdsOffsetMetadata_Blk: 37376 +- LdsPadA: 4 ++ LdsOffsetMetadata: 52224 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -2857,37 +2870,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 4] ++ MIWaveTileA: 7 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 256 +- MacroTileA: 64 +- MacroTileB: 256 ++ MacroTile0: 224 ++ MacroTile1: 128 ++ MacroTileA: 224 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2896,19 +2909,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 112 ++ NumLoadsA: 7 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -2918,35 +2931,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 4 ++ ThreadTileA: 28 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2961,8 +2974,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -2970,14 +2983,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2990,9 +3003,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3001,7 +3014,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x16_MI32x1snbRadADcH3VLWxDA1lvUeDK50bhOU55_dlfKgiU_w= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16xPRJj0zz7hN0by3_iNEvut3rrr_kE0akuizAZBVJ06M4= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -3011,21 +3024,22 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -3043,48 +3057,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 16 +- LSPB: 16 +- LVCA: 16 +- LVCB: 16 +- LVPA: 16 +- LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49408 ++ LdsBytesNoAmax: 48640 + LdsInitCVgprs: false +- LdsNumBytes: 49408 +- LdsNumElementsAlignedA: 4352 +- LdsNumElementsAlignedB: 12288 ++ LdsNumBytes: 48640 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 4352 +- LdsOffsetB_Blk: 37120 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4352 +- LdsOffsetMetadata_Blk: 37120 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 48640 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3092,27 +3106,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] +- MIWaveTileA: 1 ++ MIWaveTile: [7, 3] ++ MIWaveTileA: 7 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 224 ++ MacroTile1: 96 ++ MacroTileA: 224 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -3120,19 +3134,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 4 ++ NumElementsPerThread: 84 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 7 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -3148,11 +3162,11 @@ + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -3160,16 +3174,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 28 + ThreadTile1: 3 +- ThreadTileA: 16 ++ ThreadTileA: 28 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -3192,16 +3206,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3216,7 +3230,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3225,24 +3239,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x16_MI32xDDmcrUJrf3x-Jns3NCDxT1F9HD_sL74uISu4sIo7rRg= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x_ytLjFx-td-HxGNrEds4I1ViLXPH7ivWzceom3SQcLU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -3255,7 +3270,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -3267,35 +3282,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 128 +- LSPA: 64 +- LSPB: 8 +- LVCA: 4 +- LVCB: 32 +- LVPA: 16 +- LVPB: 2 ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 + LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 12800 ++ LdsBytesNoAmax: 44032 + LdsInitCVgprs: false +- LdsNumBytes: 12800 +- LdsNumElementsAlignedA: 4608 ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 35840 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 20992 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12800 +- LdsOffsetMetadata_Blk: 20992 +- LdsPadA: 4 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -3305,37 +3320,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [2, 1] +- MIWaveTileA: 2 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 2] ++ MIWaveTileA: 7 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 224 ++ MacroTile1: 64 ++ MacroTileA: 224 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -3344,19 +3359,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 1 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 7 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -3366,35 +3381,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 1 +- ThreadTileA: 32 +- ThreadTileB: 1 ++ ThreadTile0: 28 ++ ThreadTile1: 2 ++ ThreadTileA: 28 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3409,8 +3424,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -3418,14 +3433,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3440,7 +3455,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3449,24 +3464,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x16_MI32x3n-K4-w7IdmMUW0caDoBVMWL-MHBa7YJFEmEdTzYQU8I= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16xG47IluMl13C2YjfoiRIZoUNtlk5qHgOdZdD79UgBJ8Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -3474,7 +3490,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -3482,7 +3498,7 @@ + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -3491,36 +3507,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 +- LSPA: 64 +- LSPB: 4 +- LVCA: 4 +- LVCB: 64 +- LVPA: 16 +- LVPB: 4 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 + LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 8704 ++ LdsBytesNoAmax: 40448 + LdsInitCVgprs: false +- LdsNumBytes: 8704 +- LdsNumElementsAlignedA: 4608 +- LdsNumElementsAlignedB: 4096 ++ LdsNumBytes: 40448 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 20992 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 +- LdsOffsetMetadata_Blk: 20992 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 40448 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3529,10 +3545,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3540,26 +3556,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 ++ MIWaveTile: [7, 1] ++ MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 224 ++ MacroTile1: 32 ++ MacroTileA: 224 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -3567,21 +3583,21 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 1 +- NumLoadsB: 4 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 7 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 1 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3589,18 +3605,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -3608,16 +3624,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 28 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -3640,16 +3656,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3662,9 +3678,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3673,7 +3689,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x16_MI32x3qCwjJMrREEOH_vPGitnGSi8CBPSjWQMRKS3V293RRFc= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI169Dgt81FDS4l_J0zh8r6IsxMVoHzTZv73uJEFOV3QvLA= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -3683,14 +3699,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -3698,15 +3715,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -3715,35 +3732,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 64 ++ LSCA: 32 ++ LSCB: 256 + LSPA: 32 +- LSPB: 2 +- LVCA: 4 ++ LSPB: 4 ++ LVCA: 8 + LVCB: 64 + LVPA: 8 +- LVPB: 2 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 6400 ++ LdsBytesNoAmax: 125952 + LdsInitCVgprs: false +- LdsNumBytes: 6400 +- LdsNumElementsAlignedA: 2304 +- LdsNumElementsAlignedB: 4096 ++ LdsNumBytes: 125952 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 8192 +- LdsOffsetB: 2304 +- LdsOffsetB_Blk: 10496 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 6400 +- LdsOffsetMetadata_Blk: 10496 +- LdsPadA: 4 ++ LdsOffsetMetadata: 27648 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -3753,96 +3770,96 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 8] ++ MIWaveTileA: 6 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 32 +- MacroTile1: 64 +- MacroTileA: 32 +- MacroTileB: 64 ++ MacroTile0: 192 ++ MacroTile1: 256 ++ MacroTileA: 192 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 1 ++ NumElementsPerThread: 192 ++ NumGlobalWriteVectorsPerThread: 96 ++ NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 8 ++ ThreadTileA: 24 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3857,23 +3874,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3888,7 +3905,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3897,24 +3914,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x16_MI32x35NmXKdzc2h4jXrnOc_GkG4uzXYlvVU_AnECIGL6PeWw= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16FJrG4YvH-bdVmTqDv0uSp8V1nebgwI9zbMs14R1awNY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -3922,15 +3940,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -3939,36 +3957,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 ++ LSCA: 32 + LSCB: 32 + LSPA: 32 +- LSPB: 4 +- LVCA: 4 +- LVCB: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 3584 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 14848 ++ LdsBytesNoAmax: 122368 + LdsInitCVgprs: false +- LdsNumBytes: 14848 +- LdsNumElementsAlignedA: 4608 +- LdsNumElementsAlignedB: 2048 ++ LdsNumBytes: 122368 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 29184 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 8192 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 12800 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4608 +- LdsOffsetMetadata_Blk: 12800 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 27648 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3977,37 +3995,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 7] ++ MIWaveTileA: 6 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 192 ++ MacroTile1: 224 ++ MacroTileA: 192 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -4015,58 +4033,58 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 2 +- NumLoadsB: 4 ++ NumElementsPerThread: 168 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 6 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 4 +- NumThreads: 128 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 7 ++ ThreadTileA: 24 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4081,23 +4099,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -4110,9 +4128,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4121,24 +4139,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x256x32_MI32_HBEY3NdobJNe96nadGHMz1UQpqr5ky8PrB8jc8ynCQ= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16aVuUCfhX0bmiOhKrJLyecn2cygdMCVesb72GRSOqrb8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4146,15 +4165,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -4163,35 +4182,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 256 ++ LSCB: 64 + LSPA: 32 +- LSPB: 1 ++ LSPB: 16 + LVCA: 8 +- LVCB: 256 ++ LVCB: 16 + LVPA: 8 +- LVPB: 1 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 66560 ++ LdsBytesNoAmax: 117760 + LdsInitCVgprs: false +- LdsNumBytes: 66560 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 117760 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 164864 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 66560 +- LdsOffsetMetadata_Blk: 164864 +- LdsPadA: 4 ++ LdsOffsetMetadata: 27648 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -4200,11 +4219,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -4212,26 +4231,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 +- MIWaveTileB: 4 ++ MIWaveTile: [6, 6] ++ MIWaveTileA: 6 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 256 +- MacroTileA: 256 +- MacroTileB: 256 ++ MacroTile0: 192 ++ MacroTile1: 192 ++ MacroTileA: 192 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -4240,20 +4259,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 256 +- NumGlobalWriteVectorsPerThread: 64 +- NumLoadsA: 8 +- NumLoadsB: 32 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 6 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 32 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -4261,14 +4280,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4277,20 +4296,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 4 +- ThreadTileA: 64 +- ThreadTileB: 4 ++ ThreadTile0: 24 ++ ThreadTile1: 6 ++ ThreadTileA: 24 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4305,17 +4324,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -4334,9 +4353,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4345,24 +4364,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x192x32_MI16WjtdTFKxJRvp2QENX4hEFETLpZBd2pZ27wBywIjLBiA= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16nqTf3MkVM53IYeQmQAbI-8oOpZE9QympKk0UIngc4A0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4375,7 +4395,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -4387,43 +4407,43 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 ++ LSCB: 32 + LSPA: 32 +- LSPB: 16 ++ LSPB: 32 + LVCA: 8 +- LVCB: 16 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 3072 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 123904 ++ LdsBytesNoAmax: 48640 + LdsInitCVgprs: false +- LdsNumBytes: 123904 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 24576 ++ LdsNumBytes: 48640 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 20992 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 99328 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33792 +- LdsOffsetMetadata_Blk: 99328 ++ LdsOffsetMetadata: 48640 ++ LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -4436,14 +4456,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [8, 6] +- MIWaveTileA: 8 +- MIWaveTileB: 6 ++ MIWaveTile: [6, 5] ++ MIWaveTileA: 6 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 192 +- MacroTileA: 256 +- MacroTileB: 192 ++ MacroTile0: 192 ++ MacroTile1: 160 ++ MacroTileA: 192 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -4455,8 +4475,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -4464,20 +4484,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 6 ++ NumElementsPerThread: 120 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 6 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 2 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -4486,13 +4506,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 20 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4501,20 +4521,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 6 +- ThreadTileA: 32 +- ThreadTileB: 6 ++ ThreadTile0: 24 ++ ThreadTile1: 5 ++ ThreadTileA: 24 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4529,8 +4549,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -4538,8 +4558,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -4558,9 +4578,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4569,24 +4589,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x32_MI32w8zIpRVVlSi0IZuoEGG-xQ5Z_2xTllFtVsJaX5py5mU= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16snUjadt2Jxlwj818k83D3p8uFBr6RTUAktClrTeJQFM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4599,7 +4620,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -4611,7 +4632,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 +@@ -4621,38 +4642,38 @@ + LVCB: 32 + LVPA: 8 + LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 44032 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 33280 ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 98816 +- LdsPadA: 4 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -4660,27 +4681,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [6, 4] ++ MIWaveTileA: 6 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 256 ++ MacroTile0: 192 + MacroTile1: 128 +- MacroTileA: 256 ++ MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -4688,19 +4709,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -4709,14 +4730,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 21 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4725,20 +4746,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 24 ++ ThreadTile1: 4 ++ ThreadTileA: 24 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4753,17 +4774,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -4782,9 +4803,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4793,24 +4814,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x32_MI32xnWrouSd4ARhx9FU4JX1oMm8LlAgGYjTGbW33neXAGnE= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16xxHByrjIjGbFBdNlNJkySGE_JqxIC2jqETWDOHWv5eHE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4818,15 +4840,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -4835,48 +4857,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 ++ LSCB: 32 + LSPA: 32 +- LSPB: 4 ++ LSPB: 32 + LVCA: 8 +- LVCB: 64 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 40448 + LdsInitCVgprs: false +- LdsNumBytes: 107008 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 40448 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 98816 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 40448 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -4884,27 +4906,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 1] +- MIWaveTileA: 4 +- MIWaveTileB: 1 ++ MIWaveTile: [6, 3] ++ MIWaveTileA: 6 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 64 +- MacroTileA: 256 +- MacroTileB: 64 ++ MacroTile0: 192 ++ MacroTile1: 96 ++ MacroTileA: 192 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -4912,20 +4934,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 6 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -4934,13 +4956,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 22 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4949,20 +4971,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 1 +- ThreadTileA: 64 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 3 ++ ThreadTileA: 24 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4977,17 +4999,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5006,9 +5028,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5017,24 +5039,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x256x32_MI32GA4n0DvweiAS_6WVGbvjA3kNQx0nI4TvHkUbqsQ7BTs= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16xli5c05z6exXHwe4moQ0yTN-Q0LxqyTH2uUmEO6BeJ34= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5047,7 +5070,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -5059,48 +5082,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 256 ++ LSCB: 64 + LSPA: 32 +- LSPB: 4 ++ LSPB: 16 + LVCA: 8 +- LVCB: 64 ++ LVCB: 16 + LVPA: 8 +- LVPB: 1 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 123264 ++ LdsBytesNoAmax: 35840 + LdsInitCVgprs: false +- LdsNumBytes: 123264 +- LdsNumElementsAlignedA: 24960 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 35840 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 24960 +- LdsOffsetB_Blk: 90496 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 24960 +- LdsOffsetMetadata_Blk: 90496 +- LdsPadA: 4 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5108,27 +5131,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 +- MIWaveTileB: 4 ++ MIWaveTile: [6, 2] ++ MIWaveTileA: 6 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 +- MacroTile1: 256 ++ MacroTile1: 64 + MacroTileA: 192 +- MacroTileB: 256 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -5136,20 +5159,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 192 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 +- NumLoadsB: 8 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5158,13 +5181,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 23 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5173,20 +5196,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 4 +- ThreadTileA: 48 +- ThreadTileB: 4 ++ ThreadTile0: 24 ++ ThreadTile1: 2 ++ ThreadTileA: 24 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5201,17 +5224,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5230,9 +5253,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5241,37 +5264,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x32_MI32io4-0OesrhJ0eXRJ8UIbHDDxdnDWhNYZn0zrgvg_Yoc= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16x-NSFH3xnoCrlVguSOBxx1C0ThoaVV8et1DCbLeHU-dk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -5283,48 +5307,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 +- LSPA: 8 +- LSPB: 16 +- LVCA: 32 +- LVCB: 16 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 ++ LVPB: 8 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 116224 ++ LdsBytesNoAmax: 32256 + LdsInitCVgprs: false +- LdsNumBytes: 116224 +- LdsNumElementsAlignedA: 26112 +- LdsNumElementsAlignedB: 24576 ++ LdsNumBytes: 32256 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 26112 +- LdsOffsetB_Blk: 91648 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 60416 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 26112 +- LdsOffsetMetadata_Blk: 91648 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 32256 ++ LdsOffsetMetadata_Blk: 60416 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5332,27 +5356,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [6, 1] ++ MIWaveTileA: 6 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 +- MacroTile1: 192 ++ MacroTile1: 32 + MacroTileA: 192 +- MacroTileB: 192 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -5360,20 +5384,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 24 +- NumLoadsB: 6 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 6 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 24 +- NumLoadsPerpendicularB: 2 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5382,13 +5406,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 24 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5397,20 +5421,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ ThreadTile0: 24 ++ ThreadTile1: 1 ++ ThreadTileA: 24 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5425,17 +5449,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5456,7 +5480,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5465,24 +5489,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32PPUSibPVnXIY4-Bs9Xj_kLI2R6e8ToLyWA9cli7GtHw= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16tlNy7-tmyUe5c-SUiecJ6lI7H-jk1Zc8e1ysepWKNSI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5490,7 +5515,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -5498,7 +5523,7 @@ + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -5507,48 +5532,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 128 ++ LSCB: 256 + LSPA: 32 +- LSPB: 2 ++ LSPB: 4 + LVCA: 8 +- LVCB: 128 ++ LVCB: 64 + LVPA: 8 +- LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 106880 ++ LdsBytesNoAmax: 123904 + LdsInitCVgprs: false +- LdsNumBytes: 106880 +- LdsNumElementsAlignedA: 24960 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 123904 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 24960 +- LdsOffsetB_Blk: 90496 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 24960 +- LdsOffsetMetadata_Blk: 90496 +- LdsPadA: 4 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5556,27 +5581,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [5, 8] ++ MIWaveTileA: 5 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 160 ++ MacroTile1: 256 ++ MacroTileA: 160 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -5584,20 +5609,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 6 +- NumLoadsB: 16 ++ NumElementsPerThread: 160 ++ NumGlobalWriteVectorsPerThread: 160 ++ NumLoadsA: 5 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 6 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5606,13 +5631,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 25 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5624,17 +5649,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 20 ++ ThreadTile1: 8 ++ ThreadTileA: 20 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5650,16 +5675,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5678,9 +5703,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5689,24 +5714,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x32_MI32xOPwHhPVu5Ehlxvb26Izz5raTPScTObcefmQWDA7JgW4= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16g5htufMzsAiqQ0wjt2huHni6mmvEFpIRCqNd9h4zteo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5731,36 +5757,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 ++ LSCB: 32 + LSPA: 32 +- LSPB: 16 ++ LSPB: 32 + LVCA: 8 +- LVCB: 16 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 ++ LVPB: 8 + LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 3584 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 35840 ++ LdsBytesNoAmax: 120320 + LdsInitCVgprs: false +- LdsNumBytes: 35840 +- LdsNumElementsAlignedA: 27648 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 120320 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 29184 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 27648 +- LdsOffsetB_Blk: 93184 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 35840 +- LdsOffsetMetadata_Blk: 93184 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -5768,11 +5794,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5780,26 +5806,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 +- MIWaveTileB: 1 ++ MIWaveTile: [5, 7] ++ MIWaveTileA: 5 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 160 ++ MacroTile1: 224 ++ MacroTileA: 160 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -5808,20 +5834,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 6 +- NumLoadsB: 2 ++ NumElementsPerThread: 140 ++ NumGlobalWriteVectorsPerThread: 140 ++ NumLoadsA: 5 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 6 +- NumLoadsPerpendicularB: 2 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5829,14 +5855,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 26 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5848,17 +5874,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 1 +- ThreadTileA: 48 +- ThreadTileB: 1 ++ ThreadTile0: 20 ++ ThreadTile1: 7 ++ ThreadTileA: 20 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5880,10 +5906,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5902,9 +5928,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5913,37 +5939,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x32_MI32ebYcHG-B6I9DKCbULmBcpSfMrQxsYItXRoDNtryNdmw= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI167KfnJ8VcUnoJEHDqmFlaiZn8eXZfL18y5X6cEE1EnIA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -5955,76 +5982,76 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 256 +- LSPA: 8 +- LSPB: 4 +- LVCA: 32 +- LVCB: 64 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 + LVPA: 8 +- LVPB: 1 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82432 +- LdsPadA: 4 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 6] ++ MIWaveTileA: 5 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 160 ++ MacroTile1: 192 ++ MacroTileA: 160 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6032,20 +6059,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerThread: 120 ++ NumGlobalWriteVectorsPerThread: 120 ++ NumLoadsA: 5 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6054,13 +6081,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 27 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6069,20 +6096,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 20 ++ ThreadTile1: 6 ++ ThreadTileA: 20 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6097,7 +6124,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -6106,8 +6133,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6126,9 +6153,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6137,7 +6164,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI16all-L95D0JmVj9xLcpW-817bdMAXWl27n3446t8rgig= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16fJ0lRhOB6_VVxN7Vxtrdu6xT4mI2ogZtEpbr4Q9IcV4= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -6148,29 +6175,30 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -6179,43 +6207,43 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 +- LSPA: 8 +- LSPB: 4 +- LVCA: 32 +- LVCB: 64 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 3072 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107520 ++ LdsBytesNoAmax: 46592 + LdsInitCVgprs: false +- LdsNumBytes: 107520 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 24576 ++ LdsNumBytes: 46592 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 20992 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 82944 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 82944 ++ LdsOffsetMetadata: 46592 ++ LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -6227,15 +6255,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [8, 3] +- MIWaveTileA: 8 +- MIWaveTileB: 3 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 5] ++ MIWaveTileA: 5 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 192 +- MacroTileA: 128 +- MacroTileB: 192 ++ MacroTile0: 160 ++ MacroTile1: 160 ++ MacroTileA: 160 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -6247,8 +6275,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6256,20 +6284,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 24 +- NumLoadsA: 16 +- NumLoadsB: 24 ++ NumElementsPerThread: 100 ++ NumGlobalWriteVectorsPerThread: 100 ++ NumLoadsA: 5 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6277,14 +6305,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 28 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6293,20 +6321,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 3 +- ThreadTileA: 32 +- ThreadTileB: 3 ++ ThreadTile0: 20 ++ ThreadTile1: 5 ++ ThreadTileA: 20 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6321,17 +6349,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6350,9 +6378,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6361,37 +6389,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI16iCuPvLLfX4nK_IlPaVpcFkVrUR5tWpzgFmnF8cudo7A= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16qsqvg8l3BCVcfEjJwEfviXlv25OXeGY6-y_aV8VYM0g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -6403,34 +6432,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 +- LSPA: 8 ++ LSPA: 32 + LSPB: 8 +- LVCA: 32 ++ LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 +- LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 41984 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 17408 ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 82944 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 82944 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 +@@ -6438,8 +6467,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -6452,13 +6481,13 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 128 ++ MacroTile0: 160 + MacroTile1: 128 +- MacroTileA: 128 ++ MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 +@@ -6471,8 +6500,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6480,19 +6509,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 ++ NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -6502,13 +6531,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 29 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6517,19 +6546,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 20 + ThreadTile1: 4 +- ThreadTileA: 16 ++ ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -6545,7 +6574,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -6554,8 +6583,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6574,9 +6603,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6585,7 +6614,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x32_MI16x6JzQNqHo08hcIYAyDDY0Lawpb6taHMiYvJ4xZtHBjFE= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16xd5ZRGXxeivNcDjC5DdxR_InQIDkiBj4IlDkLZNRh48M= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -6596,29 +6625,30 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -6627,36 +6657,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 +- LSPA: 8 +- LSPB: 4 +- LVCA: 32 +- LVCB: 64 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 1024 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58368 ++ LdsBytesNoAmax: 38400 + LdsInitCVgprs: false +- LdsNumBytes: 58368 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 38400 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 12800 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 50176 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 50176 ++ LdsOffsetMetadata: 38400 ++ LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -6676,14 +6706,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [5, 3] ++ MIWaveTileA: 5 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 160 ++ MacroTile1: 96 ++ MacroTileA: 160 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -6695,7 +6725,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -6704,20 +6734,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 5 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6725,14 +6755,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 30 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6741,20 +6771,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 20 ++ ThreadTile1: 3 ++ ThreadTileA: 20 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6769,8 +6799,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -6778,8 +6808,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6798,9 +6828,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6809,37 +6839,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x32_MI32xhR-RD-yOL47rdu5mRtt2Y4TfxxpyC9tmgKTFYVobhVo= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16xFFKAM9t8-wR26fXJkO13Btej8d5cNemNvhkvB2b6KEE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -6851,76 +6882,76 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 256 +- LSPA: 8 +- LSPB: 4 +- LVCA: 32 +- LVCB: 64 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 + LVPA: 8 +- LVPB: 1 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 33792 + LdsInitCVgprs: false +- LdsNumBytes: 107008 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 74240 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 +- LdsOffsetMetadata_Blk: 74240 +- LdsPadA: 4 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 2] ++ MIWaveTileA: 5 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 256 +- MacroTileA: 64 +- MacroTileB: 256 ++ MacroTile0: 160 ++ MacroTile1: 64 ++ MacroTileA: 160 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6928,20 +6959,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 5 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6949,14 +6980,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 31 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6965,19 +6996,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 20 + ThreadTile1: 2 +- ThreadTileA: 32 ++ ThreadTileA: 20 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -6993,7 +7024,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -7002,8 +7033,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7024,7 +7055,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7033,7 +7064,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x32_MI16xXIKNcIPDUuyi7uopkDKEGgu_QyGxeRtjhf33FNEQ9VM= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16x0d5VjmFlIAKABTgiK3GmLbUmV1-a3YoPpglp4lrlAQU= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -7044,13 +7075,14 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -7063,7 +7095,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -7075,34 +7107,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 ++ LSCB: 32 + LSPA: 32 +- LSPB: 16 ++ LSPB: 32 + LVCA: 8 +- LVCB: 16 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 3072 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 33792 ++ LdsBytesNoAmax: 30208 + LdsInitCVgprs: false +- LdsNumBytes: 33792 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 25088 ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 74240 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 58368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33792 +- LdsOffsetMetadata_Blk: 74240 ++ LdsOffsetMetadata: 30208 ++ LdsOffsetMetadata_Blk: 58368 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 +@@ -7123,16 +7155,16 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 3] +- MIWaveTileA: 4 +- MIWaveTileB: 3 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 1] ++ MIWaveTileA: 5 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 +- MagicDivAlg: 2 ++ MacroTile0: 160 ++ MacroTile1: 32 ++ MacroTileA: 160 ++ MacroTileB: 32 ++ MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 +@@ -7143,7 +7175,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7152,20 +7184,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 12 +- NumLoadsA: 2 +- NumLoadsB: 6 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 5 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 2 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7173,14 +7205,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 32 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -7189,20 +7221,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 20 ++ ThreadTile1: 1 ++ ThreadTileA: 20 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7217,17 +7249,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7246,9 +7278,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7257,7 +7289,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI16xa9rxkUClGFoKWWP1-eb88m0DeTetbTb9Ez-aU5oA7zs= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16Y9Ui3-IsPz3Cwb9NkHpY92w75j8HR1U4DzS89pG8pno= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -7268,29 +7300,30 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -7299,34 +7332,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 128 +- LSPA: 8 +- LSPB: 2 +- LVCA: 32 +- LVCB: 128 ++ LSCB: 256 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 64 + LVPA: 8 +- LVPB: 2 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 2048 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58368 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 58368 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 41984 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 9216 +- LdsOffsetMetadata_Blk: 41984 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 +@@ -7348,14 +7381,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 4] +- MIWaveTileA: 2 +- MIWaveTileB: 4 ++ MIWaveTile: [4, 8] ++ MIWaveTileA: 4 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 128 ++ MacroTile1: 256 ++ MacroTileA: 128 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -7367,7 +7400,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7376,20 +7409,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 16 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7397,14 +7430,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 33 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -7413,20 +7446,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 8 +- ThreadTile1: 4 +- ThreadTileA: 8 +- ThreadTileB: 4 ++ ThreadTile0: 16 ++ ThreadTile1: 8 ++ ThreadTileA: 16 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7441,7 +7474,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -7450,8 +7483,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7472,7 +7505,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7481,24 +7514,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x32_MI32x3pMWVj_qOhEWwCSOkgIhHnLfx80G7wzMJwZ9hnyU5yyA= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16gezbiGieIb43SnojHg7zVH8Bt26ePf3vzJbFQmdkRDM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -7506,15 +7540,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -7523,36 +7557,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 +- LSCB: 64 ++ LSCB: 32 + LSPA: 32 +- LSPB: 4 ++ LSPB: 32 + LVCA: 8 +- LVCB: 64 ++ LVCB: 8 + LVPA: 8 +- LVPB: 4 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3584 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 17408 ++ LdsBytesNoAmax: 46592 + LdsInitCVgprs: false +- LdsNumBytes: 17408 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 46592 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 29184 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 41984 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 41984 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 46592 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -7560,11 +7594,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -7572,26 +7606,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTile: [4, 7] ++ MIWaveTileA: 4 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 128 ++ MacroTile1: 224 ++ MacroTileA: 128 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7600,20 +7634,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 2 +- NumLoadsB: 8 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 4 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7621,14 +7655,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 34 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -7637,20 +7671,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 1 ++ ThreadTile1: 7 + ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7665,17 +7699,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7696,7 +7730,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7705,40 +7739,41 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x32_MI32x3qLpZGV0leZ1DPlK4kcokeaSoWNLG2J6vi3TjQSxjVD0= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16cAQ6FdSA5UF15I3F5jjoR8vVjba3F2MsEJSHoPWFiPI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -7747,35 +7782,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 64 +- LSPA: 4 +- LSPB: 2 +- LVCA: 32 +- LVCB: 64 +- LVPA: 4 +- LVPB: 2 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 0 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 12800 ++ LdsBytesNoAmax: 41984 + LdsInitCVgprs: false +- LdsNumBytes: 12800 +- LdsNumElementsAlignedA: 4608 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 20992 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12800 +- LdsOffsetMetadata_Blk: 20992 +- LdsPadA: 4 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -7784,38 +7819,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 6] ++ MIWaveTileA: 4 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 32 +- MacroTile1: 64 +- MacroTileA: 32 +- MacroTileB: 64 ++ MacroTile0: 128 ++ MacroTile1: 192 ++ MacroTileA: 128 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7823,36 +7858,36 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 16 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 4 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 +- NumThreads: 128 +- NumWaveSplitK: 1 +- OptNoLoadLoop: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 35 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -7861,20 +7896,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 1 ++ ThreadTile1: 6 + ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7889,17 +7924,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7918,9 +7953,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7929,24 +7964,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x32_MI32x3FwnsqRyP-ytqhkmGe7Q5r1BLct3FOgY_6yFmmmqynMQ= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16ilFr2Yd8vLjFlb0eOFb12Cbl2ZhZuAlCRih0bZbbl1Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -7954,15 +7990,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -7971,112 +8007,112 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +- LSPA: 16 +- LSPB: 4 ++ LSPA: 32 ++ LSPB: 32 + LVCA: 8 +- LVCB: 32 +- LVPA: 4 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 28800 ++ LdsBytesNoAmax: 38400 + LdsInitCVgprs: false +- LdsNumBytes: 28800 +- LdsNumElementsAlignedA: 8320 +- LdsNumElementsAlignedB: 4096 ++ LdsNumBytes: 38400 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 20992 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 8320 +- LdsOffsetB_Blk: 24704 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8320 +- LdsOffsetMetadata_Blk: 24704 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 38400 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 128 ++ MacroTile1: 160 ++ MacroTileA: 128 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 +- NumLoadsB: 8 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 ++ NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 36 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8085,20 +8121,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 1 ++ ThreadTile1: 5 + ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8113,17 +8149,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -8144,7 +8180,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8153,24 +8189,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x64_MI32E2DzxcLCzsde5PzQhmiMRdxlRZWNZmW4Qw391GQAg8k= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16qO0AvmkoBBx18Bssg3z8uwpuZl6C_pzXEw_Ad8bIIa4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 64 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8195,35 +8232,14210 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 64 ++ LSCA: 32 + LSCB: 128 +- LSPA: 16 ++ LSPA: 32 + LSPB: 8 +- LVCA: 16 ++ LVCA: 8 + LVCB: 32 +- LVPA: 4 ++ LVPA: 8 + LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 33792 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 197632 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 4 ++ ThreadTileA: 16 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16xx5Z8V5BPfyPQz8QPoVHOvgESW3lc9EQx6Yzc7zw4cnE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30208 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 12800 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 30208 ++ LdsOffsetMetadata_Blk: 50176 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 96 ++ MacroTileA: 128 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 4 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16xePdejsis5ugBSlSN91euaJpYOV60UlYEdS6LKEvytAw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 25600 ++ LdsInitCVgprs: false ++ LdsNumBytes: 25600 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 8192 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 50176 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 2] ++ MIWaveTileA: 4 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 64 ++ MacroTileA: 128 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 2 ++ ThreadTileA: 16 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16xzsh_E-apV-xqGF4i2PjpqxP1JJN0R1uzW3pkYC4Pfjg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 22016 ++ LdsInitCVgprs: false ++ LdsNumBytes: 22016 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 4608 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 22016 ++ LdsOffsetMetadata_Blk: 50176 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 32 ++ MacroTileA: 128 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 4 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16x0ZdzI_qBV8ezATQdBmurPYOagqXEIlWJv835dgsJK_g= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 48128 ++ LdsInitCVgprs: false ++ LdsNumBytes: 48128 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 48128 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 8] ++ MIWaveTileA: 3 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 256 ++ MacroTileA: 96 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 96 ++ NumLoadsA: 3 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 8 ++ ThreadTileA: 12 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16x0YP5VKu0CiXHDzn3M412zvUkq-MiSc4RhO_5ktU6D48= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 3584 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 44544 ++ LdsInitCVgprs: false ++ LdsNumBytes: 44544 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 29184 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 44544 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 7] ++ MIWaveTileA: 3 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 224 ++ MacroTileA: 96 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 84 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 3 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 7 ++ ThreadTileA: 12 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16xyia6B9VrMnnN7PHQHAS9ZrMMqKbk0K3cZSCSJlEed0o= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 3072 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 39936 ++ LdsInitCVgprs: false ++ LdsNumBytes: 39936 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 24576 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 39936 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 6] ++ MIWaveTileA: 3 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 192 ++ MacroTileA: 96 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 3 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 6 ++ ThreadTileA: 12 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16xn4X9P7j6l6K3y1FQeILqL4oTOeShdX_jX1kmj_-hFSk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 2560 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 36352 ++ LdsInitCVgprs: false ++ LdsNumBytes: 36352 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 20992 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 36352 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 5] ++ MIWaveTileA: 3 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 160 ++ MacroTileA: 96 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 3 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 5 ++ ThreadTileA: 12 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16xj31dD1n1JswYHlFjxUg7QJNG1aOqXEE2Gucws3a47Fs= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 8 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 8 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 31744 ++ LdsInitCVgprs: false ++ LdsNumBytes: 31744 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 31744 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 128 ++ MacroTileA: 96 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 3 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x13Z4pPJ_-ie-punYtulcVJednc_Arp1Fa5xRT5lfiVrQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 28160 ++ LdsInitCVgprs: false ++ LdsNumBytes: 28160 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 12800 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 28160 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 3] ++ MIWaveTileA: 3 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 96 ++ MacroTileA: 96 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 36 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 3 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 3 ++ ThreadTileA: 12 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x10V1zRxPpdzz0OqyMHALzUVfAgxjK1-l98mBFz7yX18k= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 23552 ++ LdsInitCVgprs: false ++ LdsNumBytes: 23552 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 8192 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 23552 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 2] ++ MIWaveTileA: 3 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 64 ++ MacroTileA: 96 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 3 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 2 ++ ThreadTileA: 12 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x1oAfWTYZjhHwr0KugXVizEMvKVN_J9qoN0Ah7zJPHm7M= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 19968 ++ LdsInitCVgprs: false ++ LdsNumBytes: 19968 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 4608 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 19968 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 3 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 1 ++ ThreadTileA: 12 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16xeKi1y-69aQXWb3trV1d8fy0yb7-tcSTylHOVVqS_-J0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 41984 ++ LdsInitCVgprs: false ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 8] ++ MIWaveTileA: 2 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 256 ++ MacroTileA: 64 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 8 ++ ThreadTileA: 8 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16xBt9cF1NRSeMo9Dtc2zW0pnNPMphcX71p50rH_0Synw8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 3584 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 38400 ++ LdsInitCVgprs: false ++ LdsNumBytes: 38400 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 29184 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 38400 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 7] ++ MIWaveTileA: 2 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 224 ++ MacroTileA: 64 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 2 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 7 ++ ThreadTileA: 8 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16xvo_lLMIvqWoDjtGzeBHX3okezcrr6RfkhyqHQKPsOAE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 3072 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 33792 ++ LdsInitCVgprs: false ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 24576 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 6] ++ MIWaveTileA: 2 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 192 ++ MacroTileA: 64 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 2 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 6 ++ ThreadTileA: 8 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16xR5tliZHLYZPmjghhuf5SkD2igje62a8bv_Y0Bw252Rw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 2560 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30208 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 20992 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 30208 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 5] ++ MIWaveTileA: 2 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 160 ++ MacroTileA: 64 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 2 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 5 ++ ThreadTileA: 8 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16xHW2b2I7MFxTPKQbMmxoEl6aXsVUg--roYwMQuVbSBtA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 8 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 8 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 25600 ++ LdsInitCVgprs: false ++ LdsNumBytes: 25600 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 4] ++ MIWaveTileA: 2 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 128 ++ MacroTileA: 64 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 2 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 4 ++ ThreadTileA: 8 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x1IS5PlgR3i6KQOInm3VHXHz0qrrf9StmJZVmyZpMZ0N4= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 22016 ++ LdsInitCVgprs: false ++ LdsNumBytes: 22016 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 12800 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 22016 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 3] ++ MIWaveTileA: 2 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 96 ++ MacroTileA: 64 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 2 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 3 ++ ThreadTileA: 8 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x1XFMBxcQpu9Gv9k0ZjZKGYH2Sxkr9gJs2V9mYAA1SC-A= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 17408 ++ LdsInitCVgprs: false ++ LdsNumBytes: 17408 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 8192 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 17408 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 2] ++ MIWaveTileA: 2 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 64 ++ MacroTileA: 64 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 2 ++ ThreadTileA: 8 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x1-adWG7ABiaVBbgv_48y6RZUCJ_NNQx4xQp9eTBZKaVE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30208 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30208 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 4608 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 25600 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 9216 ++ LdsOffsetMetadata_Blk: 25600 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 32 ++ MacroTileA: 64 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 2 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 1 ++ ThreadTileA: 8 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16xRrHHNIQ_S2QJn5iKa-KLlCMEoJQeCXU5DKBNXXjleqQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 32 ++ LSPB: 4 ++ LVCA: 8 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 37888 ++ LdsInitCVgprs: false ++ LdsNumBytes: 37888 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 70656 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 37888 ++ LdsOffsetMetadata_Blk: 70656 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 8] ++ MIWaveTileA: 1 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 256 ++ MacroTileA: 32 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 8 ++ ThreadTileA: 4 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16xgSmZKmoiMJ8srEpAlLjxEdlTofGW-tLulyyDDeGE1Lo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 3584 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 34304 ++ LdsInitCVgprs: false ++ LdsNumBytes: 34304 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 29184 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 70656 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34304 ++ LdsOffsetMetadata_Blk: 70656 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 7] ++ MIWaveTileA: 1 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 224 ++ MacroTileA: 32 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 1 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 7 ++ ThreadTileA: 4 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16xEmqnjEf30hakvtwhnxr53cQj4Rk5l04FzKwUrxgo_Qs= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 3072 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 29696 ++ LdsInitCVgprs: false ++ LdsNumBytes: 29696 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 24576 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 29696 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 6] ++ MIWaveTileA: 1 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 192 ++ MacroTileA: 32 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 1 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 6 ++ ThreadTileA: 4 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16xcmoYlTgB9nMz5vMcsfGJft9qsZ9yuPm-oPHyqlkq984= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 2560 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 26112 ++ LdsInitCVgprs: false ++ LdsNumBytes: 26112 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 20992 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 26112 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 5] ++ MIWaveTileA: 1 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 160 ++ MacroTileA: 32 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 1 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 5 ++ ThreadTileA: 4 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16xbGQ01k_WvsJB2zTHclqGooS2vi0FTrWjcuKwD-HVvro= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 128 ++ LSPA: 32 ++ LSPB: 8 ++ LVCA: 8 ++ LVCB: 32 ++ LVPA: 8 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 21504 ++ LdsInitCVgprs: false ++ LdsNumBytes: 21504 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 21504 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 128 ++ MacroTileA: 32 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x1jvZTqpxgox0APYfcxlg_msMFTPRgA45IsyrYkpEJts8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 17920 ++ LdsInitCVgprs: false ++ LdsNumBytes: 17920 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 12800 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 17920 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] ++ MIWaveTileA: 1 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 96 ++ MacroTileA: 32 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 1 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x15bh8s909jRmCVWBspwpALNS60DdW5IX9_IPo12qGh1Y= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 64 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 29696 ++ LdsInitCVgprs: false ++ LdsNumBytes: 29696 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 8192 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 5120 ++ LdsOffsetMetadata_Blk: 21504 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 2] ++ MIWaveTileA: 1 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 64 ++ MacroTileA: 32 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 1 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x1nM8xD1Pj5KrnOCjBeDJkcRuL3ae90qGcM4SThQB9Yrc= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 26112 ++ LdsInitCVgprs: false ++ LdsNumBytes: 26112 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 4608 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 5120 ++ LdsOffsetMetadata_Blk: 21504 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 1] ++ MIWaveTileA: 1 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 1 ++ ThreadTileA: 4 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16kJ2OPyT8iUbbtJgI4IrkFjNT52CGL232A1aNFgaE1Ic= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3840_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB15_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3840 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 66048 ++ LdsInitCVgprs: false ++ LdsNumBytes: 66048 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 31232 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 165888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 66048 ++ LdsOffsetMetadata_Blk: 165888 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 15] ++ MIWaveTileA: 4 ++ MIWaveTileB: 15 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 240 ++ MacroTileA: 256 ++ MacroTileB: 240 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 240 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 8 ++ NumLoadsB: 15 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 15 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3840_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB15_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 15 ++ ThreadTileA: 16 ++ ThreadTileB: 15 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16kFC52s8JsOWZLcFyb8eBgS-Xbno1_gXguShAgQdaDB0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3328_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB13_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3328 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 127488 ++ LdsInitCVgprs: false ++ LdsNumBytes: 127488 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 27136 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 13] ++ MIWaveTileA: 4 ++ MIWaveTileB: 13 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 208 ++ MacroTileA: 256 ++ MacroTileB: 208 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 208 ++ NumGlobalWriteVectorsPerThread: 52 ++ NumLoadsA: 8 ++ NumLoadsB: 13 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 13 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3328_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB13_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 13 ++ ThreadTileA: 16 ++ ThreadTileB: 13 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16LrqCPUKZTtys6k_vc4aBUusKgI-eaf78qHOz6_lX3Dg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2816_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB11_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2816 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 123392 ++ LdsInitCVgprs: false ++ LdsNumBytes: 123392 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 23040 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 11] ++ MIWaveTileA: 4 ++ MIWaveTileB: 11 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 176 ++ MacroTileA: 256 ++ MacroTileB: 176 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 176 ++ NumGlobalWriteVectorsPerThread: 44 ++ NumLoadsA: 8 ++ NumLoadsB: 11 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 11 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2816_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB11_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 11 ++ ThreadTileA: 16 ++ ThreadTileB: 11 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16v9wGR45l790Tvffo9NYX9O0ElwxfKlImPlAH119VuVc= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2304_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB9_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2304 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 119296 ++ LdsInitCVgprs: false ++ LdsNumBytes: 119296 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 18944 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 9] ++ MIWaveTileA: 4 ++ MIWaveTileB: 9 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 144 ++ MacroTileA: 256 ++ MacroTileB: 144 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 8 ++ NumLoadsB: 9 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 9 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2304_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB9_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 9 ++ ThreadTileA: 16 ++ ThreadTileB: 9 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16GCTCYVwrUuj8pVlRaM2qAK-o-wT68HOLpQitoIaFbu8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1792_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1792 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 49664 ++ LdsInitCVgprs: false ++ LdsNumBytes: 49664 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 14848 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 49664 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 7] ++ MIWaveTileA: 4 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 112 ++ MacroTileA: 256 ++ MacroTileB: 112 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 8 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1792_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 7 ++ ThreadTileA: 16 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16xdvSU57HVE-HBm5lYFNRUQaFNPW1-sxOLOGiaJac4bUM= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1280_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1280 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 45568 ++ LdsInitCVgprs: false ++ LdsNumBytes: 45568 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 10752 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 45568 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 80 ++ MacroTileA: 256 ++ MacroTileB: 80 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 8 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1280_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 5 ++ ThreadTileA: 16 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16xFYK1IdkxToXplSiQG5MSbN6RIeqnM_uQg17FKxWcVKA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB768_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 768 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 41472 ++ LdsInitCVgprs: false ++ LdsNumBytes: 41472 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 6656 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 41472 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 48 ++ MacroTileA: 256 ++ MacroTileB: 48 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 8 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB768_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16xmalmdorBlPywwbZMH8AAk9WMWszyLIC0-dLVQVjt3w0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 16 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 37376 ++ LdsInitCVgprs: false ++ LdsNumBytes: 37376 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 2560 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 37376 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 16 ++ MacroTileA: 256 ++ MacroTileB: 16 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16oUALzJsgjZhBTFG4gL3DPdBB-Jmpslts1LgP2N1NKgY= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 71168 ++ LdsInitCVgprs: false ++ LdsNumBytes: 71168 ++ LdsNumElementsAlignedA: 38400 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 38400 ++ LdsOffsetB_Blk: 169472 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 71168 ++ LdsOffsetMetadata_Blk: 169472 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [15, 4] ++ MIWaveTileA: 15 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 240 ++ MacroTile1: 256 ++ MacroTileA: 240 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 240 ++ NumGlobalWriteVectorsPerThread: 240 ++ NumLoadsA: 15 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 15 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 60 ++ ThreadTile1: 4 ++ ThreadTileA: 60 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16xUWSyhCPICuN__qbnbpG7V8BHVKp4HsHAMyQ_VoFLFU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 66048 ++ LdsInitCVgprs: false ++ LdsNumBytes: 66048 ++ LdsNumElementsAlignedA: 33280 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33280 ++ LdsOffsetB_Blk: 164352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 66048 ++ LdsOffsetMetadata_Blk: 164352 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [13, 4] ++ MIWaveTileA: 13 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 208 ++ MacroTile1: 256 ++ MacroTileA: 208 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 208 ++ NumGlobalWriteVectorsPerThread: 208 ++ NumLoadsA: 13 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 13 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 52 ++ ThreadTile1: 4 ++ ThreadTileA: 52 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16gJovFvGffz4vn_WBjMFxGXVRQOfyHCjGdxTRO4NZwDo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 126464 ++ LdsInitCVgprs: false ++ LdsNumBytes: 126464 ++ LdsNumElementsAlignedA: 28160 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 28160 ++ LdsOffsetB_Blk: 93696 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 28160 ++ LdsOffsetMetadata_Blk: 93696 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [11, 4] ++ MIWaveTileA: 11 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 176 ++ MacroTile1: 256 ++ MacroTileA: 176 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 176 ++ NumGlobalWriteVectorsPerThread: 176 ++ NumLoadsA: 11 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 11 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 44 ++ ThreadTile1: 4 ++ ThreadTileA: 44 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16jktI8_QSXQs1gWwpTYJnmLCjFtG4cQdbG9wIjpsVbv0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 121344 ++ LdsInitCVgprs: false ++ LdsNumBytes: 121344 ++ LdsNumElementsAlignedA: 23040 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 23040 ++ LdsOffsetB_Blk: 88576 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 23040 ++ LdsOffsetMetadata_Blk: 88576 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [9, 4] ++ MIWaveTileA: 9 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 144 ++ MacroTile1: 256 ++ MacroTileA: 144 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 144 ++ NumLoadsA: 9 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 9 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 36 ++ ThreadTile1: 4 ++ ThreadTileA: 36 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16rHnO0ZXJGd-rtNtuEbh1UXKGmLOF2M1IfMjh2NKGuWo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 50688 ++ LdsInitCVgprs: false ++ LdsNumBytes: 50688 ++ LdsNumElementsAlignedA: 17920 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17920 ++ LdsOffsetB_Blk: 83456 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 50688 ++ LdsOffsetMetadata_Blk: 83456 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [7, 4] ++ MIWaveTileA: 7 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 112 ++ MacroTile1: 256 ++ MacroTileA: 112 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 112 ++ NumLoadsA: 7 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 4 ++ ThreadTileA: 28 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16xDGLN0qute_TFL4mbNJtep3rCIfXgbJqloR4YYRrWyyo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 45568 ++ LdsInitCVgprs: false ++ LdsNumBytes: 45568 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 78336 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 45568 ++ LdsOffsetMetadata_Blk: 78336 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 80 ++ MacroTile1: 256 ++ MacroTileA: 80 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 5 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 4 ++ ThreadTileA: 20 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16xHDWwahzrjorpalLymFrXw1b6_n1Uie9aCFwRIDG4cxo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 40448 ++ LdsInitCVgprs: false ++ LdsNumBytes: 40448 ++ LdsNumElementsAlignedA: 7680 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 7680 ++ LdsOffsetB_Blk: 73216 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 40448 ++ LdsOffsetMetadata_Blk: 73216 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 48 ++ MacroTile1: 256 ++ MacroTileA: 48 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 3 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16x0l9zL69Q207_D_tUJfBqxjx6yo7A-IOtjnwioecCJ2U= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 8 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 35328 ++ LdsInitCVgprs: false ++ LdsNumBytes: 35328 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 68096 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 35328 ++ LdsOffsetMetadata_Blk: 68096 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 16 ++ MacroTile1: 256 ++ MacroTileA: 16 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16xFJ90K4qqU6sLtC9Blb6gCzoKgIal1BuCP4uTAKdLLpQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 76800 ++ LdsInitCVgprs: false ++ LdsNumBytes: 76800 ++ LdsNumElementsAlignedA: 67584 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 76800 ++ LdsOffsetMetadata_Blk: 198656 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [8, 1] ++ MIWaveTileA: 8 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 32 ++ MacroTileA: 256 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 16 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 16 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 32 ++ ThreadTile1: 1 ++ ThreadTileA: 32 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16xUwnkKSI6XT4HQGWmUvJ3W5MtkzQKVOvymn4hbFUNEII= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 80896 ++ LdsInitCVgprs: false ++ LdsNumBytes: 80896 ++ LdsNumElementsAlignedA: 64512 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 64512 ++ LdsOffsetB_Blk: 195584 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 80896 ++ LdsOffsetMetadata_Blk: 195584 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 2] ++ MIWaveTileA: 7 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 224 ++ MacroTile1: 64 ++ MacroTileA: 224 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 14 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 14 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 2 ++ ThreadTileA: 28 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16xJNDkcZa01KXGE5BlL8wa-TzGCHAAV0lHjTqWe0RGjpo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 73728 ++ LdsInitCVgprs: false ++ LdsNumBytes: 73728 ++ LdsNumElementsAlignedA: 64512 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 64512 ++ LdsOffsetB_Blk: 195584 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 73728 ++ LdsOffsetMetadata_Blk: 195584 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 1] ++ MIWaveTileA: 7 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 224 ++ MacroTile1: 32 ++ MacroTileA: 224 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 14 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 14 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 1 ++ ThreadTileA: 28 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16xbQIhMgMv7Lm6K6woiPOUjhY65VDB5PpZsYcAEo74NxE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 77824 ++ LdsInitCVgprs: false ++ LdsNumBytes: 77824 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 183296 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 77824 ++ LdsOffsetMetadata_Blk: 183296 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 3] ++ MIWaveTileA: 6 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 96 ++ MacroTileA: 192 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 12 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 3 ++ ThreadTileA: 24 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16x4qt_NZCjpNB8jVYuwBs3o6c5I9GEFwlG_9t763rG40M= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 68608 ++ LdsInitCVgprs: false ++ LdsNumBytes: 68608 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 183296 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 68608 ++ LdsOffsetMetadata_Blk: 183296 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 2] ++ MIWaveTileA: 6 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 64 ++ MacroTileA: 192 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 12 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 2 ++ ThreadTileA: 24 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16x4yiwaZ0IJTeJl_EFPJfJEtSi-6QzC_0Pzqvn42ssLB4= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 61440 ++ LdsInitCVgprs: false ++ LdsNumBytes: 61440 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 117760 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 61440 ++ LdsOffsetMetadata_Blk: 117760 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 1] ++ MIWaveTileA: 6 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 32 ++ MacroTileA: 192 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 12 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 1 ++ ThreadTileA: 24 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI164XslkVYc_u_Aii42xKW1MtvEeqSXnUTG6GLIC1EwTjc= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 32 ++ LVPA: 4 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 78848 ++ LdsInitCVgprs: false ++ LdsNumBytes: 78848 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 177152 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 78848 ++ LdsOffsetMetadata_Blk: 177152 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 128 ++ MacroTileA: 160 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 10 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 4 ++ ThreadTileA: 20 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16x3Q72zKC3P5_RuVCY9_Dprtt-6rhnyOJ4_5uzZ9AIt5I= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 71680 ++ LdsInitCVgprs: false ++ LdsNumBytes: 71680 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 177152 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 71680 ++ LdsOffsetMetadata_Blk: 177152 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 3] ++ MIWaveTileA: 5 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 96 ++ MacroTileA: 160 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 10 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 3 ++ ThreadTileA: 20 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16xsyzBYgVA9Dgtr5YeyE01dWt9HhKyCY_qvowfA3bWzPE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 62464 ++ LdsInitCVgprs: false ++ LdsNumBytes: 62464 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 111616 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 62464 ++ LdsOffsetMetadata_Blk: 111616 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 2] ++ MIWaveTileA: 5 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 64 ++ MacroTileA: 160 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 10 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 2 ++ ThreadTileA: 20 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16xmH7kgh0HCR6aUzflLUBTm1weavqwcYh7TVSveTuPOeA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 55296 ++ LdsInitCVgprs: false ++ LdsNumBytes: 55296 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 111616 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 55296 ++ LdsOffsetMetadata_Blk: 111616 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 1] ++ MIWaveTileA: 5 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 32 ++ MacroTileA: 160 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 10 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 1 ++ ThreadTileA: 20 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16_wWNaEE3T0bXt2JLYslV-RpfwA5jCOdhPQbSynxL1sk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 2560 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 75776 ++ LdsInitCVgprs: false ++ LdsNumBytes: 75776 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 41984 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 75776 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 160 ++ MacroTileA: 128 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 8 ++ NumLoadsB: 10 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 5 ++ ThreadTileA: 16 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16UYYuELKABJdS-ALYVPFyvjJu-S5mP3Uz6BxiICTr7wY= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 32 ++ LVPA: 4 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 66560 ++ LdsInitCVgprs: false ++ LdsNumBytes: 66560 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 66560 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 4 ++ ThreadTileA: 16 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16xe-iTN_0l1omDuXzwLKYmdsAmGXlnKUwaO77lmF556Lo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 59392 ++ LdsInitCVgprs: false ++ LdsNumBytes: 59392 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 59392 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 96 ++ MacroTileA: 128 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 8 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16xyOheKkC_GNW4jHwuyqcMQiJ-TMkQUQyYcBYlRyoG7Yg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 50176 ++ LdsInitCVgprs: false ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 2] ++ MIWaveTileA: 4 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 64 ++ MacroTileA: 128 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 8 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 2 ++ ThreadTileA: 16 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16xf3Iv0r_qhkX8K5fjR8jcEg-uPg2Q2RAgaOeTNObdTjQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 43008 ++ LdsInitCVgprs: false ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 32 ++ MacroTileA: 128 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16xiFXaX01qnLm-fQeIWK8KHSC9sOIhLOOYw3Me2K_Xhu8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 3072 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 76800 ++ LdsInitCVgprs: false ++ LdsNumBytes: 76800 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 49152 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 158720 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 76800 ++ LdsOffsetMetadata_Blk: 158720 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 6] ++ MIWaveTileA: 3 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 192 ++ MacroTileA: 96 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 6 ++ NumLoadsB: 12 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 95 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 6 ++ ThreadTileA: 12 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16x-L9-y2K0IIdJE5MKH-8Po1BveimD_o9YaP1WqLa343Y= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 2560 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 69632 ++ LdsInitCVgprs: false ++ LdsNumBytes: 69632 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 41984 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 158720 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 69632 ++ LdsOffsetMetadata_Blk: 158720 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 5] ++ MIWaveTileA: 3 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 160 ++ MacroTileA: 96 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 6 ++ NumLoadsB: 10 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 96 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 5 ++ ThreadTileA: 12 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16xHvLNISj__E9GJPd3z4sSZTR38gAa1Be0G19P49IBk-8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 128 ++ LSPA: 16 ++ LSPB: 8 ++ LVCA: 16 ++ LVCB: 32 ++ LVPA: 4 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 60416 ++ LdsInitCVgprs: false ++ LdsNumBytes: 60416 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 60416 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 0 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 128 ++ MacroTileA: 96 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 6 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 97 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x1HM91OeSTBD5z9DSNmOHhr54sMKChoDD2khGGIzAP5cQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1536 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 53248 ++ LdsInitCVgprs: false ++ LdsNumBytes: 53248 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 53248 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 16 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 3] ++ MIWaveTileA: 3 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 96 ++ MacroTileA: 96 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 36 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 6 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 98 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 3 ++ ThreadTileA: 12 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: false ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x1ELEI5Avw4uK1D7Zt_DsIIInwzQjxFHWPmyS8p1vZi-I= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: false ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 44032 ++ LdsInitCVgprs: false ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 99328 +- LdsOffsetMetadata_Blk: 197632 +- LdsPadA: 4 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -8232,11 +22444,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8244,26 +22456,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 ++ MIWaveTile: [3, 2] ++ MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 128 +- MacroTileA: 256 +- MacroTileB: 128 ++ MacroTile0: 96 ++ MacroTile1: 64 ++ MacroTileA: 96 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8272,20 +22484,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 6 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8299,8 +22511,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 37 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 99 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8309,19 +22521,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 ++ ThreadTile0: 12 + ThreadTile1: 2 +- ThreadTileA: 64 ++ ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -8337,17 +22549,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8368,7 +22580,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8377,24 +22589,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x64_MI32xbeNDfeD9akeSbV2pqGlI84vUBp_5xqbx9v8VJ73uWsU= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x1iLPprFWa7jo-fCVrowux-6RxAegV8CtBnlhE2qSuTUM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8407,7 +22620,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -8419,36 +22632,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 ++ LSCB: 32 + LSPA: 16 +- LSPB: 16 ++ LSPB: 32 + LVCA: 16 +- LVCB: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 4 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 83968 ++ LdsBytesNoAmax: 36864 + LdsInitCVgprs: false +- LdsNumBytes: 83968 +- LdsNumElementsAlignedA: 67584 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 36864 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 67584 +- LdsOffsetB_Blk: 198656 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 83968 +- LdsOffsetMetadata_Blk: 198656 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 36864 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8456,38 +22669,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 64 +- MacroTileA: 256 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8496,20 +22709,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 16 +- NumLoadsB: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 6 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8523,8 +22736,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 38 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT256x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 100 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8533,20 +22746,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 12 ++ ThreadTile1: 1 ++ ThreadTileA: 12 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8561,17 +22774,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8592,7 +22805,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8601,24 +22814,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x64_MI323Ngz3BZrYdlJL5Cr6H3XuOzLxb_QoOhW6ecXusKTEQ8= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16xgLpZvQhIjOAl7JZQfciO8CP-smbZm-1r17ZIKtB9rEY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8631,7 +22845,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -8643,36 +22857,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 ++ LSCB: 32 + LSPA: 16 +- LSPB: 16 ++ LSPB: 32 + LVCA: 16 +- LVCB: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3584 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 101376 ++ LdsBytesNoAmax: 75776 + LdsInitCVgprs: false +- LdsNumBytes: 101376 +- LdsNumElementsAlignedA: 52224 +- LdsNumElementsAlignedB: 49152 ++ LdsNumBytes: 75776 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 58368 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 52224 +- LdsOffsetB_Blk: 183296 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 101376 +- LdsOffsetMetadata_Blk: 183296 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 75776 ++ LdsOffsetMetadata_Blk: 148480 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8680,11 +22894,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8692,26 +22906,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [2, 7] ++ MIWaveTileA: 2 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 192 +- MacroTileA: 192 +- MacroTileB: 192 ++ MacroTile0: 64 ++ MacroTile1: 224 ++ MacroTileA: 64 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8720,20 +22934,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 12 +- NumLoadsB: 12 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 4 ++ NumLoadsB: 14 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 4 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8747,8 +22961,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 39 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 101 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8757,20 +22971,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ ThreadTile0: 8 ++ ThreadTile1: 7 ++ ThreadTileA: 8 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8785,17 +22999,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8816,7 +23030,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8825,24 +23039,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI32d8vQmX_zN58tMuugvv69NwL5x9siL9nXvGfZEuizdM8= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16xKO45fIND6bwnyP1jXI7i6Q2lw-V-sXM_2VYM5Fc_Ghs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8855,7 +23070,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -8867,35 +23082,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 128 ++ LSCB: 64 + LSPA: 16 +- LSPB: 8 ++ LSPB: 16 + LVCA: 16 +- LVCB: 32 ++ LVCB: 16 + LVPA: 4 +- LVPB: 2 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 84992 ++ LdsBytesNoAmax: 66560 + LdsInitCVgprs: false +- LdsNumBytes: 84992 +- LdsNumElementsAlignedA: 52224 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 66560 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 52224 +- LdsOffsetB_Blk: 183296 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 84992 +- LdsOffsetMetadata_Blk: 183296 +- LdsPadA: 4 ++ LdsOffsetMetadata: 66560 ++ LdsOffsetMetadata_Blk: 148480 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -8904,11 +23119,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8916,26 +23131,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [2, 6] ++ MIWaveTileA: 2 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 64 ++ MacroTile1: 192 ++ MacroTileA: 64 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8944,20 +23159,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 12 +- NumLoadsB: 8 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 4 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8971,8 +23186,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 40 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 102 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8981,20 +23196,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 +- StreamK: 3 +- StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 6 ++ ThreadTileA: 8 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9009,17 +23224,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9040,7 +23255,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9049,24 +23264,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x64_MI32x7yaegZogzqaj785XzP0oCgKErthq_KGZMMVl2rZCnLE= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16xJsQ0Dds9bslyAQt7uNWKhI94HWYFZQWIms7cMn8hLUU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9079,7 +23295,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -9091,36 +23307,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 ++ LSCB: 32 + LSPA: 16 +- LSPB: 16 ++ LSPB: 32 + LVCA: 16 +- LVCB: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 68608 ++ LdsBytesNoAmax: 59392 + LdsInitCVgprs: false +- LdsNumBytes: 68608 +- LdsNumElementsAlignedA: 52224 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 59392 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 41984 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 52224 +- LdsOffsetB_Blk: 183296 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 68608 +- LdsOffsetMetadata_Blk: 183296 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 59392 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -9128,11 +23344,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -9140,26 +23356,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 +- MIWaveTileB: 1 ++ MIWaveTile: [2, 5] ++ MIWaveTileA: 2 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 64 ++ MacroTile1: 160 ++ MacroTileA: 64 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9168,20 +23384,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 12 +- NumLoadsB: 4 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 4 ++ NumLoadsB: 10 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 4 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9189,14 +23405,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 41 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 103 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9205,20 +23421,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 1 +- ThreadTileA: 48 +- ThreadTileB: 1 ++ ThreadTile0: 8 ++ ThreadTile1: 5 ++ ThreadTileA: 8 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9233,17 +23449,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9264,7 +23480,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9273,24 +23489,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x64_MI32pFpRnCTRz6vaWNPVl89tBwYKjfRUTdB0PCk3fs4ta44= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16x3NGjAvqRTjwQjYCIZJn66CGLrl458ZSxgtI8A4Ga1LQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9303,7 +23520,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -9315,35 +23532,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 256 ++ LSCB: 128 + LSPA: 16 +- LSPB: 4 ++ LSPB: 8 + LVCA: 16 +- LVCB: 64 ++ LVCB: 32 + LVPA: 4 +- LVPB: 1 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 65536 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 164352 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 98816 +- LdsOffsetMetadata_Blk: 164352 +- LdsPadA: 4 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -9352,38 +23569,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 4] ++ MIWaveTileA: 2 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 64 ++ MacroTile1: 128 ++ MacroTileA: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9392,20 +23609,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 16 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9419,8 +23636,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 42 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 104 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9429,20 +23646,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 4 ++ ThreadTileA: 8 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9457,8 +23674,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -9466,8 +23683,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9488,7 +23705,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9497,24 +23714,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x64_MI32Da_ba0-R56Omg8XqvNSGvaV7GCQ6v3PbZ--xsrEod1Q= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x1T7KsL28NlsOcdtOJjqWFvroWubLpf_iEWbRIXKEIT_w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9539,36 +23757,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 ++ LSCB: 32 + LSPA: 16 +- LSPB: 16 ++ LSPB: 32 + LVCA: 16 +- LVCB: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 4 ++ LVPB: 8 + LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82944 ++ LdsBytesNoAmax: 43008 + LdsInitCVgprs: false +- LdsNumBytes: 82944 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 49152 ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 164864 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82944 +- LdsOffsetMetadata_Blk: 164864 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -9576,11 +23794,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -9592,22 +23810,22 @@ + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 192 +- MacroTileA: 128 +- MacroTileB: 192 ++ MacroTile0: 64 ++ MacroTile1: 96 ++ MacroTileA: 64 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9616,20 +23834,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 12 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 4 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9643,8 +23861,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 43 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 105 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9656,16 +23874,16 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 8 + ThreadTile1: 3 +- ThreadTileA: 32 ++ ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -9688,10 +23906,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9712,7 +23930,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9721,24 +23939,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32caS7Lofsmj441POulVz6inThn0yOaP4J2E_wgR41Aro= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x1ICadMNyZxhO-zl8pn7vreyBJF-Q8P6cNfqfein7OgyA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9763,35 +23982,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 128 ++ LSCB: 64 + LSPA: 16 +- LSPB: 8 ++ LSPB: 16 + LVCA: 16 +- LVCB: 32 ++ LVCB: 16 + LVPA: 4 +- LVPB: 2 ++ LVPB: 4 + LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 66560 ++ LdsBytesNoAmax: 33792 + LdsInitCVgprs: false +- LdsNumBytes: 66560 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 32768 +- LdsNumElementsAlignedMetadata: 0 +- LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 164864 ++ LdsNumBytes: 33792 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 66560 +- LdsOffsetMetadata_Blk: 164864 +- LdsPadA: 4 ++ LdsOffsetMetadata: 33792 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -9800,11 +24019,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -9816,22 +24035,22 @@ + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 64 ++ MacroTile1: 64 ++ MacroTileA: 64 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9840,20 +24059,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9867,8 +24086,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 44 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 106 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9880,16 +24099,16 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 8 + ThreadTile1: 2 +- ThreadTileA: 32 ++ ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -9912,10 +24131,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9934,9 +24153,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9945,40 +24164,41 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI16xNSYERS3r4znhyrjFnjr1GqvW3UZzFvGfGs8RIVUesf0= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x1aSRGHoi4bYjqUx8lp36ZL3M1z3Dsm5y1lFIx0aMWNaM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -9987,36 +24207,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 +- LSPA: 4 +- LSPB: 4 +- LVCA: 64 +- LVCB: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115712 ++ LdsBytesNoAmax: 26624 + LdsInitCVgprs: false +- LdsNumBytes: 115712 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 26624 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 99328 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33792 +- LdsOffsetMetadata_Blk: 99328 ++ LdsOffsetMetadata: 26624 ++ LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -10036,14 +24256,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 64 ++ MacroTile1: 32 ++ MacroTileA: 64 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10055,7 +24275,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -10064,20 +24284,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 32 +- NumLoadsB: 16 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 4 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 32 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10085,14 +24305,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 45 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 107 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10101,20 +24321,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 1 ++ ThreadTileA: 8 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10129,8 +24349,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -10138,8 +24358,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10160,7 +24380,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10169,24 +24389,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x64_MI32xHKfOLHOufIrPd8okTWCLLqtPf4wM0pEqbc67ystw8AI= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16xKsd2yOpSVD1APS-vZ3yflFSfMXlnX0hxN4F-4ymbNjQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10199,7 +24420,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -10211,7 +24432,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 256 +@@ -10221,25 +24442,25 @@ + LVCB: 64 + LVPA: 4 + LVPB: 1 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82432 ++ LdsBytesNoAmax: 74752 + LdsInitCVgprs: false +- LdsNumBytes: 82432 +- LdsNumElementsAlignedA: 16896 ++ LdsNumBytes: 74752 ++ LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 147968 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82432 +- LdsOffsetMetadata_Blk: 147968 +- LdsPadA: 4 ++ LdsOffsetMetadata: 74752 ++ LdsOffsetMetadata_Blk: 140288 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -10248,38 +24469,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 8] ++ MIWaveTileA: 1 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 64 ++ MacroTile0: 32 + MacroTile1: 256 +- MacroTileA: 64 ++ MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -10288,19 +24509,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 ++ NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 ++ NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -10309,14 +24530,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 46 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 108 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10325,20 +24546,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 8 ++ ThreadTileA: 4 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10353,8 +24574,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -10362,8 +24583,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10382,9 +24603,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10393,24 +24614,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x64_MI16xXJO1yiqi_bDOc3CPwpWLw76PwvFqFsJoNYddvj5Bxow= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16xAXyi8t5sMANlRq4ST1avyH2nh2h9Np8rw0Bg2sgkFBY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10423,7 +24645,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -10435,43 +24657,43 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 ++ LSCB: 32 + LSPA: 16 +- LSPB: 16 ++ LSPB: 32 + LVCA: 16 +- LVCB: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 3072 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 3584 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 67584 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 49152 ++ LdsNumBytes: 67584 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 58368 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66048 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82944 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82944 ++ LdsOffsetMetadata: 67584 ++ LdsOffsetMetadata_Blk: 140288 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -10483,15 +24705,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 3] +- MIWaveTileA: 4 +- MIWaveTileB: 3 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 7] ++ MIWaveTileA: 1 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 32 ++ MacroTile1: 224 ++ MacroTileA: 32 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10503,8 +24725,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10512,20 +24734,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 12 +- NumLoadsA: 4 +- NumLoadsB: 12 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 2 ++ NumLoadsB: 14 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 4 ++ NumLoadsCoalescedB: 7 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10539,30 +24761,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 47 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 109 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3584_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 4 ++ ThreadTile1: 7 ++ ThreadTileA: 4 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10577,17 +24799,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10606,9 +24828,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10617,24 +24839,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI16xsUp5B_B-TOQBH4MuMo9UIajAWAPfSKqXRb3frpNNHgs= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16xJ2tUe2MOrr4KhcjEzRhUuRgF82Wox1pN_4YFCypci40= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10647,7 +24870,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -10659,34 +24882,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 128 ++ LSCB: 64 + LSPA: 16 +- LSPB: 8 ++ LSPB: 16 + LVCA: 16 +- LVCB: 32 ++ LVCB: 16 + LVPA: 4 +- LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 2048 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 3072 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 58368 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 58368 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82432 ++ LdsOffsetMetadata: 58368 ++ LdsOffsetMetadata_Blk: 74752 + LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 +@@ -10694,8 +24917,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -10707,15 +24930,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 6] ++ MIWaveTileA: 1 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 32 ++ MacroTile1: 192 ++ MacroTileA: 32 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10727,8 +24950,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10736,20 +24959,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 4 +- NumLoadsB: 8 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 2 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10757,14 +24980,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 48 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 110 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB3072_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10773,20 +24996,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 6 ++ ThreadTileA: 4 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10801,17 +25024,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10830,9 +25053,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10841,31 +25064,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x64_MI32x3n0knr745biPR-YbUHfMhF4CkUWIrB7_w6nGNY5inQ8w= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16xBoxk0oBj0dw95_Q4yhe0GSoHcnoj8CYIBcAXphv_XMA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -10883,48 +25107,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 +- LSPA: 4 +- LSPB: 16 +- LVCA: 64 +- LVCB: 16 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 + LVPA: 4 +- LVPB: 4 ++ LVPB: 8 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 41984 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 82944 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 82944 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -10932,27 +25156,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] ++ MIWaveTile: [1, 5] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 32 ++ MacroTile1: 160 ++ MacroTileA: 32 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10960,20 +25184,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 2 ++ NumLoadsB: 10 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsCoalescedB: 5 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10981,14 +25205,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 49 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 111 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2560_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11000,17 +25224,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 5 ++ ThreadTileA: 4 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11032,10 +25256,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11056,7 +25280,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11065,24 +25289,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x64_MI32x3surL8TJiuT4KUInSoEmgZIgy74hqSfVjELQICbpOObA= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x1MbaNBmB7CjVqKDePe1Q6wkqdfaTXhajr3z2cct-Tpc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11107,35 +25332,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 64 +- LSPA: 8 ++ LSCB: 128 ++ LSPA: 16 + LSPB: 8 + LVCA: 16 +- LVCB: 16 +- LVPA: 2 ++ LVCB: 32 ++ LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 25088 ++ LdsBytesNoAmax: 41984 + LdsInitCVgprs: false +- LdsNumBytes: 25088 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 41984 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 41472 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 25088 +- LdsOffsetMetadata_Blk: 41472 +- LdsPadA: 4 ++ LdsOffsetMetadata: 41984 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -11144,61 +25369,61 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 4] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 64 ++ MacroTile1: 128 + MacroTileA: 32 +- MacroTileB: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 ++ NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -11211,8 +25436,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 50 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 112 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11224,17 +25449,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11250,16 +25475,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11278,9 +25503,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11289,24 +25514,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x64_MI32x3kDkYEgmdCRgX9WT-fQY5KCEvkHpFPe1ST3dSsrbrsBQ= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x1j4eJB6fjzw3HeYIs5C5Wh8U9kVYcZXeRDew_UwmNHK8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11331,98 +25557,98 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 32 +- LSPA: 8 +- LSPB: 16 ++ LSPA: 16 ++ LSPB: 32 + LVCA: 16 + LVCB: 8 +- LVPA: 2 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 57600 ++ LdsBytesNoAmax: 34816 + LdsInitCVgprs: false +- LdsNumBytes: 57600 +- LdsNumElementsAlignedA: 16640 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 34816 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16640 +- LdsOffsetB_Blk: 49408 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16640 +- LdsOffsetMetadata_Blk: 49408 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 32 ++ MacroTile1: 96 ++ MacroTileA: 32 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 4 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 2 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 4 +- NumThreads: 128 ++ NumLoadsCoalescedB: 3 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -11435,8 +25661,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 51 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 113 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11448,17 +25674,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11480,10 +25706,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11502,9 +25728,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11513,24 +25739,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x64_MI32x3AHQOcHIsIWvZfAqBh3FaDp_D9NoO65Hkf1JS1OW3A-4= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x1vdMdqMtKA1WdnJXhMZbh6xr9-fO6GUpb9Ta6CpAlvF4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: 0 ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11538,7 +25765,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -11546,7 +25773,7 @@ + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -11555,75 +25782,75 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 +- LSCB: 32 ++ LSCB: 64 + LSPA: 16 +- LSPB: 8 ++ LSPB: 16 + LVCA: 16 +- LVCB: 32 ++ LVCB: 16 + LVPA: 4 +- LVPB: 8 ++ LVPB: 4 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49664 ++ LdsBytesNoAmax: 25600 + LdsInitCVgprs: false +- LdsNumBytes: 49664 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 25600 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 41472 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 +- LdsOffsetMetadata_Blk: 41472 +- LdsPadA: 4 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 1 +- LoopUnroll: 16 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 2] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 32 ++ MacroTile1: 64 + MacroTileA: 32 +- MacroTileB: 32 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -11631,21 +25858,21 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 4 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 +- NumLoadsB: 8 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -11653,14 +25880,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 0 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 52 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 114 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11672,17 +25899,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11698,16 +25925,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11728,7 +25955,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11737,24 +25964,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x128_MI32fhd6JrGc2jniWKHAmhPbJqxJbxWqvVYOdPC-JL3Dlzc= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x1-RuceYATHL63pYLqjm6LYX33jAiRgrnJvYaMWkOjS4o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11779,36 +26007,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 64 +- LSPA: 8 +- LSPB: 16 +- LVCA: 32 +- LVCB: 16 +- LVPA: 2 +- LVPB: 4 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LSCA: 64 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 4 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 134144 ++ LdsBytesNoAmax: 18432 + LdsInitCVgprs: false +- LdsNumBytes: 134144 +- LdsNumElementsAlignedA: 101376 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 18432 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 101376 +- LdsOffsetB_Blk: 363520 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 134144 +- LdsOffsetMetadata_Blk: 363520 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 18432 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -11816,11 +26044,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -11828,26 +26056,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 ++ MIWaveTile: [1, 1] ++ MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -11856,20 +26084,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 24 +- NumLoadsB: 8 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 2 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 24 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -11877,18 +26105,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 53 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT192x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 115 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -11896,16 +26124,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 48 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -11928,16 +26156,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -11952,7 +26180,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11961,24 +26189,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x128_MI3vMI2rxyeHKhSHUeGr_8Ys_0-e6n1pB5oaguOhBaAIQw= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16x2gO1kPgHRz28G4zLr6-9arj30hueNPTjSlsfOV4B6vU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11991,7 +26220,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -12003,36 +26232,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 ++ LSCA: 64 ++ LSCB: 16 ++ LSPA: 16 ++ LSPB: 64 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 4 ++ LVPB: 16 + LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 80896 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 65536 ++ LdsNumBytes: 80896 ++ LdsNumElementsAlignedA: 67584 ++ LdsNumElementsAlignedB: 13312 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 328704 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 132096 +- LdsOffsetMetadata_Blk: 328704 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 80896 ++ LdsOffsetMetadata_Blk: 198656 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12040,38 +26269,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 256 ++ MacroTile1: 48 ++ MacroTileA: 256 ++ MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12080,20 +26309,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 ++ NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12101,36 +26330,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 54 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 116 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12145,8 +26374,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -12154,14 +26383,14 @@ + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12176,7 +26405,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12185,24 +26414,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x128_MI16q2_sR3aw3JdMu36KaCXgZUUhao-LkKKhXUNNR0Ed7cw= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16xJCHLzJhsFR-TtkmRGrkiKp_q3CzX6s0a2xkJlQsKyjg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12227,36 +26457,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 64 +- LSPA: 8 +- LSPB: 16 +- LVCA: 32 +- LVCB: 16 +- LVPA: 2 +- LVPB: 4 +- LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 64 ++ LSCB: 16 ++ LSPA: 16 ++ LSPB: 64 ++ LVCA: 16 ++ LVCB: 4 ++ LVPA: 4 ++ LVPB: 16 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 72704 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 72704 ++ LdsNumElementsAlignedA: 67584 ++ LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 197632 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 99328 +- LdsOffsetMetadata_Blk: 197632 ++ LdsOffsetMetadata: 72704 ++ LdsOffsetMetadata_Blk: 198656 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12264,8 +26494,8 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -12275,15 +26505,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 1] + MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 256 ++ MacroTile1: 16 ++ MacroTileA: 256 ++ MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -12295,7 +26525,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12303,21 +26533,21 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 +- NumLoadsB: 8 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12331,12 +26561,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 55 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 117 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -12344,17 +26574,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 8 +- SubGroup1: 32 +- SubGroupA: 8 +- SubGroupB: 32 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 2 ++ ThreadTile1: 1 + ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12370,22 +26600,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 8, 1] ++ WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12400,7 +26630,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12409,24 +26639,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x128_MI32tajdU9Unw3xSmwnRv1FpwKsWrEbh_UUyRPzpYIo5Jbk= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16x5b-srJPCspzx9X-RlaXmJ3keyDvZOrDG5E23T5Z7PWw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12451,35 +26682,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 64 +- LSPA: 8 +- LSPB: 16 +- LVCA: 32 +- LVCB: 16 +- LVPA: 2 +- LVPB: 4 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LSCA: 64 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 4 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 79360 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 98304 ++ LdsNumBytes: 79360 ++ LdsNumElementsAlignedA: 13824 ++ LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 295936 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 13824 ++ LdsOffsetB_Blk: 144896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 132096 +- LdsOffsetMetadata_Blk: 295936 +- LdsPadA: 4 ++ LdsOffsetMetadata: 79360 ++ LdsOffsetMetadata_Blk: 144896 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -12488,38 +26719,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] +- MIWaveTileA: 1 +- MIWaveTileB: 3 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 48 ++ MacroTile1: 256 ++ MacroTileA: 48 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12528,20 +26759,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 24 ++ NumLoadsA: 3 ++ NumLoadsB: 16 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 3 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12549,18 +26780,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 56 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x192x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 118 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -12568,17 +26799,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12594,22 +26825,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12624,7 +26855,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12633,24 +26864,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x128_MI168pIfMggTCrpA-mVY7mUTrIF4REifG3Jr6YHXhRpgJ_E= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16xl23KPs3clig_WWapi63iF-QVkdvMVkFDPy-laVZtkKI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12663,7 +26895,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -12675,34 +26907,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 2048 ++ LSCA: 64 ++ LSCB: 256 ++ LSPA: 16 ++ LSPB: 4 ++ LVCA: 16 ++ LVCB: 64 ++ LVPA: 4 ++ LVPB: 1 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 4096 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 70144 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 33280 ++ LdsNumBytes: 70144 ++ LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 65536 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 164352 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 135680 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 98816 +- LdsOffsetMetadata_Blk: 164352 ++ LdsOffsetMetadata: 70144 ++ LdsOffsetMetadata_Blk: 135680 + LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 +@@ -12712,8 +26944,8 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -12724,14 +26956,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 16 ++ MacroTile1: 256 ++ MacroTileA: 16 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -12743,7 +26975,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12752,19 +26984,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -12773,36 +27005,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 57 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 119 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB4096_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12817,8 +27049,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -12826,14 +27058,14 @@ + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12846,9 +27078,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12857,24 +27089,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32xjOy80Tk-BpbQ5ogZrEkDVDXivnW5gTGwvXYXSP4ohfc= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16xQPKSvrdqjM1Uv-cd98hJjSunYbMPUGjgBM_yFKxI178= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12899,48 +27132,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 +- LSCB: 64 ++ LSCB: 32 + LSPA: 8 +- LSPB: 16 ++ LSPB: 32 + LVCA: 32 +- LVCB: 16 ++ LVCB: 8 + LVPA: 2 +- LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 70656 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 70656 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66048 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 99328 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 99328 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 70656 ++ LdsOffsetMetadata_Blk: 183296 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 8 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -12948,48 +27181,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 12 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13003,29 +27236,29 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 58 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 120 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 12 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -13048,10 +27281,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13070,9 +27303,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13081,24 +27314,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI32xOEGRO_LUQVts-ywwIjvNSAloqNqcS_7it2iV0of05pg= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16xdQn1pvduk3VhYlrZkLfWJnKxChQbC63-vDFwvZRzHjg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13111,7 +27345,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -13123,35 +27357,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 64 +- LSPA: 4 +- LSPB: 8 ++ LSPA: 8 ++ LSPB: 16 + LVCA: 32 + LVCB: 16 +- LVPA: 1 +- LVPB: 2 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 0 ++ LVPA: 2 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 66560 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 16896 ++ LdsNumBytes: 66560 ++ LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82432 +- LdsPadA: 4 ++ LdsOffsetMetadata: 66560 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -13160,38 +27394,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 2] ++ MIWaveTileA: 2 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 32 ++ MacroTile0: 64 + MacroTile1: 64 +- MacroTileA: 32 ++ MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -13200,21 +27434,21 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 +- NumLoadsB: 16 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 +- NumThreads: 128 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -13227,8 +27461,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 59 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 121 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13237,20 +27471,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 8 ++ ThreadTile1: 2 ++ ThreadTileA: 8 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13265,17 +27499,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13294,9 +27528,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13305,24 +27539,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x_Hb7pB8CwCvicoL9G8_yOvUNjTMHnNEBZGhm5fQOGHI= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16xAeM5VIIhfyIkaq1xE3ddp79R-YltseXQL_qqwb_PT-4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13335,7 +27570,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -13347,57 +27582,57 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 +- LSPA: 4 +- LSPB: 16 ++ LSPA: 8 ++ LSPB: 32 + LVCA: 32 + LVCB: 8 +- LVPA: 1 +- LVPB: 4 ++ LVPA: 2 ++ LVPB: 8 + LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115200 ++ LdsBytesNoAmax: 52224 + LdsInitCVgprs: false +- LdsNumBytes: 115200 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 52224 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 98816 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 52224 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 8 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 +@@ -13409,50 +27644,50 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 60 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 122 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13461,19 +27696,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 8 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -13489,17 +27724,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13518,9 +27753,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13529,24 +27764,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI32xdDpP3pjfEMbCa_QlMd710qoX26O59wb0P2gCy_rDMbI= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16xFPELKFsxtiD7x2o8aDdiE_n-frD7C_qb52MTG6aYY6U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13571,7 +27807,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 32 +@@ -13581,85 +27817,85 @@ + LVCB: 8 + LVPA: 2 + LVPB: 8 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98560 ++ LdsBytesNoAmax: 68608 + LdsInitCVgprs: false +- LdsNumBytes: 98560 +- LdsNumElementsAlignedA: 16640 +- LdsNumElementsAlignedB: 16384 ++ LdsNumBytes: 68608 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16640 +- LdsOffsetB_Blk: 82176 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16640 +- LdsOffsetMetadata_Blk: 82176 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 68608 ++ LdsOffsetMetadata_Blk: 148480 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 +- LoopUnroll: 32 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 32 ++ MacroTile1: 96 + MacroTileA: 32 +- MacroTileB: 32 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 +- NumLoadsB: 4 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 +- NumLoadsCoalescedB: 1 ++ NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 +@@ -13669,14 +27905,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 61 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 123 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1536_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13688,17 +27924,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13720,10 +27956,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13744,7 +27980,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13753,24 +27989,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x256_MI32xuhfigbyT3jLfMlDEHl7BYvkRoLB2HcpaRGVzvNqdhTo= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16xxcjpkLD6p_Rk8GRxpkKLYo_D5oEVDx0vSeV6vIOCFB8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 256 +- DirectToLds: 0 ++ DepthU: 128 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13795,35 +28032,35 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 ++ LSCA: 128 + LSCB: 64 +- LSPA: 4 ++ LSPA: 8 + LSPB: 16 +- LVCA: 64 ++ LVCA: 32 + LVCB: 16 +- LVPA: 1 ++ LVPA: 2 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 65536 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 328704 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 132096 +- LdsOffsetMetadata_Blk: 328704 +- LdsPadA: 4 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +@@ -13832,11 +28069,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 16 +- LoopUnroll: 256 ++ LoopIters: 4 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -13844,48 +28081,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] ++ MIWaveTile: [1, 2] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 ++ MacroTile0: 32 + MacroTile1: 64 +- MacroTileA: 64 ++ MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 16 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13899,12 +28136,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 62 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 124 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 1024 ++ StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -13912,17 +28149,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13938,22 +28175,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 256 +- _DepthUA: 256 +- _DepthUB: 256 +- _DepthUMetadata: 256 ++ _DepthU: 128 ++ _DepthUA: 128 ++ _DepthUB: 128 ++ _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -13966,9 +28203,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13977,24 +28214,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI32xvelKKc0tJ-ymMD1MHz_F3unfFfe-5_U37ANTg7OQxes= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16xST-W69nW6k2A3XtqqrjIyZppffVIlTV_jlt61vyKmwg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 128 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14019,55 +28257,55 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 ++ LSCA: 128 + LSCB: 32 +- LSPA: 4 ++ LSPA: 8 + LSPB: 32 +- LVCA: 64 ++ LVCA: 32 + LVCB: 8 +- LVPA: 1 ++ LVPA: 2 + LVPB: 8 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 35840 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 32768 ++ LdsNumBytes: 35840 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66048 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 99328 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 99328 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 4 +- LoopUnroll: 64 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] ++ MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 +@@ -14081,35 +28319,35 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -14123,29 +28361,29 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 63 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 125 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 1024 ++ StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -14168,16 +28406,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 256 +- _DepthUA: 256 +- _DepthUB: 256 +- _DepthUMetadata: 256 ++ _DepthU: 128 ++ _DepthUA: 128 ++ _DepthUB: 128 ++ _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -14192,7 +28430,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14201,24 +28439,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x512_MI32xSwdlZZdcgBbCkhAtUvwnOfsEfVjP3Mmb44yf8s6vJ2U= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x3oKxWloiBFVSkVEtWiI9zSrb-Klm4P_KBBFeOy6lDGs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 512 +- DirectToLds: 0 ++ DepthU: 256 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14243,55 +28482,55 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x512_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 512 ++ LSCA: 256 + LSCB: 32 +- LSPA: 2 ++ LSPA: 4 + LSPB: 32 +- LVCA: 128 ++ LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 +- LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 0 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 131584 ++ LdsBytesNoAmax: 70656 + LdsInitCVgprs: false +- LdsNumBytes: 131584 +- LdsNumElementsAlignedA: 66048 +- LdsNumElementsAlignedB: 65536 ++ LdsNumBytes: 70656 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 66048 +- LdsOffsetB_Blk: 328192 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 131584 +- LdsOffsetMetadata_Blk: 328192 +- LdsPadA: 4 +- LdsPadB: 0 ++ LdsOffsetMetadata: 70656 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 +- LoopUnroll: 128 ++ LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] ++ MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 +@@ -14305,13 +28544,13 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -14320,20 +28559,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -14347,12 +28586,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 64 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x32x512_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB0_LBSPPM0_LPA4_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 126 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 2048 ++ StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -14360,16 +28599,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -14392,16 +28631,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 512 +- _DepthUA: 512 +- _DepthUB: 512 +- _DepthUMetadata: 512 ++ _DepthU: 256 ++ _DepthUA: 256 ++ _DepthUB: 256 ++ _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -14416,7 +28655,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14425,24 +28664,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x7B2uugV2PGMFe1usV3dB0R8HNiixA_WcCRGUfKJykDk= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16xKvGGrD9SXYqt4vOi2UiWQUsiU-5ceQ8zjAZb_izsPFU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14450,7 +28690,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -14458,7 +28698,7 @@ + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -14467,14 +28707,14 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 16 + LSPA: 8 +- LSPB: 16 ++ LSPB: 64 + LVCA: 32 +- LVCB: 16 ++ LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 512 +@@ -14535,7 +28775,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -14544,8 +28784,8 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -14553,11 +28793,11 @@ + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 +- NumLoadsB: 8 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -14566,13 +28806,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 65 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 127 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -14584,7 +28824,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -14618,8 +28858,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -14640,7 +28880,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14649,24 +28889,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x128_MI16xsYXwX37LOOnKO8vzLWylhsJSOpRqf6p9vPXhzRJpOzc= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16xpTgSbrv07pr9UAZlU5wIjwp27TDkl6fY9XwbxiEpAT8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14679,7 +28920,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -14691,7 +28932,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 16 +@@ -14701,24 +28942,24 @@ + LVCB: 4 + LVPA: 2 + LVPB: 16 +- LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 27136 ++ LdsBytesNoAmax: 27648 + LdsInitCVgprs: false +- LdsNumBytes: 27136 +- LdsNumElementsAlignedA: 16896 ++ LdsNumBytes: 27648 ++ LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 49664 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 27136 +- LdsOffsetMetadata_Blk: 49664 ++ LdsOffsetMetadata: 27648 ++ LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 +@@ -14759,23 +29000,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 +@@ -14790,13 +29031,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 66 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 128 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -14805,10 +29046,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -14833,7 +29074,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -14842,8 +29083,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -14862,9 +29103,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14873,24 +29114,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x16x128_MI16xi6MStsYfk1HgtM5DI_oa_j5tCF1qn7yyZAAYN9us2H4= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16xA3p95UU_Eci5aX2_Esb291AiPMccc5dTuDypmUc2eAY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14903,7 +29145,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -14915,7 +29157,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 16 +@@ -14925,33 +29167,33 @@ + LVCB: 4 + LVPA: 2 + LVPB: 16 +- LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 45056 + LdsInitCVgprs: false +- LdsNumBytes: 107008 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 8192 ++ LdsNumBytes: 45056 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 10240 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 98816 ++ LdsOffsetMetadata: 45056 ++ LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -14983,23 +29225,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 +@@ -15014,13 +29256,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 67 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 129 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15029,10 +29271,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15057,7 +29299,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -15066,8 +29308,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15086,9 +29328,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15097,7 +29339,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16xzbND2KI11vu3jrNREFApmvYbuivQWxxjUkU6joUe7zY= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16xHKfHJ-pKJYQ8cD6MKItJhSdri1_6yZlVMjS_GPJ4RQk= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -15108,13 +29350,14 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15139,7 +29382,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 32 +@@ -15152,11 +29395,11 @@ + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 57856 ++ LdsBytesNoAmax: 27136 + LdsInitCVgprs: false +- LdsNumBytes: 57856 ++ LdsNumBytes: 27136 + LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +@@ -15165,10 +29408,10 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 ++ LdsOffsetMetadata: 27136 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 +@@ -15207,7 +29450,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -15215,9 +29458,9 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -15243,8 +29486,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 68 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 130 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15256,7 +29499,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15282,7 +29525,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -15290,8 +29533,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15310,9 +29553,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15321,7 +29564,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x64x128_MI16xJ75tS8LfR4Zs_rV2aAK1CZCdieaCCmmO_Hokjqyfk8k= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16xYUWARyitwBcDT9VHHYErzM9IqMP8xALismPxpB5H_fM= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -15332,13 +29575,14 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15363,7 +29607,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 64 +@@ -15376,11 +29620,11 @@ + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 43520 + LdsInitCVgprs: false +- LdsNumBytes: 107008 ++ LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +@@ -15389,10 +29633,10 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 ++ LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 +@@ -15431,7 +29675,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -15439,13 +29683,13 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 +@@ -15467,8 +29711,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 69 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 131 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15480,7 +29724,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15506,7 +29750,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -15514,8 +29758,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15534,9 +29778,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15545,24 +29789,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x256_MI16xjBh0s3cmhXcgz38fCl1y8jcTuNE-2cvELtzoWePK5jk= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16xM6_7qKHFTorTZRXjA8TdFsX5trzoNnmYLpVflaj_V7U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15587,7 +29832,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 16 +@@ -15598,13 +29843,13 @@ + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 37376 + LdsInitCVgprs: false +- LdsNumBytes: 98816 ++ LdsNumBytes: 37376 + LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 16384 ++ LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +@@ -15613,17 +29858,17 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 ++ LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -15655,17 +29900,17 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -15691,8 +29936,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 70 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 132 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15704,7 +29949,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15738,8 +29983,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -15760,7 +30005,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15769,24 +30014,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16xmHfGGCooXwsuG_OJXRKswRdIdsbCF2EXrggTavYCK9o= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16xLKo99AO5fELjySC9RILSSfyfJ31fKjifrRydeEvEZ4A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15799,7 +30045,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -15811,7 +30057,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 16 +@@ -15821,24 +30067,24 @@ + LVCB: 4 + LVPA: 1 + LVPB: 16 +- LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53760 ++ LdsBytesNoAmax: 54272 + LdsInitCVgprs: false +- LdsNumBytes: 53760 +- LdsNumElementsAlignedA: 33280 ++ LdsNumBytes: 54272 ++ LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 53760 +- LdsOffsetMetadata_Blk: 98816 ++ LdsOffsetMetadata: 54272 ++ LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 +@@ -15879,23 +30125,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 +@@ -15915,8 +30161,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 71 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 133 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15925,10 +30171,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15953,7 +30199,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -15962,8 +30208,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -15984,7 +30230,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15993,24 +30239,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x16x256_MI16xkI-RfSqldyagXO02oIrx9GSMzGvWiTFzF3APoKtV8DE= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16x4wxVTz093vrwLNCtyNIwInCxDHDuwZMKrNqvI937Gpo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16018,15 +30265,15 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -16035,34 +30282,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 16 + LSPA: 4 +- LSPB: 16 ++ LSPB: 64 + LVCA: 64 +- LVCB: 16 ++ LVCB: 4 + LVPA: 1 + LVPB: 16 +- LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 86528 ++ LdsBytesNoAmax: 88064 + LdsInitCVgprs: false +- LdsNumBytes: 86528 +- LdsNumElementsAlignedA: 66048 ++ LdsNumBytes: 88064 ++ LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66048 +- LdsOffsetB_Blk: 197120 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 86528 +- LdsOffsetMetadata_Blk: 197120 ++ LdsOffsetMetadata: 88064 ++ LdsOffsetMetadata_Blk: 198656 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 +@@ -16103,29 +30350,29 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -16139,8 +30386,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 72 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 134 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16149,10 +30396,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16177,7 +30424,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -16186,8 +30433,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16208,7 +30455,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16217,24 +30464,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x256_MI16xO9-F5sWsUwsyDiq1e3-qfI68M-ep6aXSbMtTeBvSD8s= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16xlSwkEu5xUt6Fq-6TmPJ7cXpQtDKDApP723TBsQzG7bI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16259,7 +30507,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 32 +@@ -16272,11 +30520,11 @@ + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49664 ++ LdsBytesNoAmax: 53760 + LdsInitCVgprs: false +- LdsNumBytes: 49664 ++ LdsNumBytes: 53760 + LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedB: 36864 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +@@ -16285,10 +30533,10 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 49664 ++ LdsOffsetMetadata: 53760 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 +@@ -16327,7 +30575,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -16335,13 +30583,13 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 +@@ -16363,8 +30611,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 73 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 135 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16376,7 +30624,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16402,7 +30650,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -16410,8 +30658,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16432,7 +30680,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16441,24 +30689,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x64x256_MI16x9TbH1CrdYD_4vvURkUX_ECcbdfgkMKB33wu3aAdtis0= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16xKpLNbZjON44kKCEHca4OPC--UEy2IOsrn7kfs32D3EM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16483,7 +30732,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 64 +@@ -16496,11 +30745,11 @@ + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82432 ++ LdsBytesNoAmax: 86528 + LdsInitCVgprs: false +- LdsNumBytes: 82432 ++ LdsNumBytes: 86528 + LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 65536 ++ LdsNumElementsAlignedB: 69632 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +@@ -16509,10 +30758,10 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82432 ++ LdsOffsetMetadata: 86528 + LdsOffsetMetadata_Blk: 147968 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 +@@ -16551,7 +30800,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -16559,13 +30808,13 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 +@@ -16587,8 +30836,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 74 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 136 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16600,7 +30849,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16626,7 +30875,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -16634,8 +30883,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16654,9 +30903,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16665,24 +30914,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x512_MI16xzvgbOIB8AJHCOBqrGYTcw5gY0VMlCiMtQOWLfCWWhyM= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16xCbNt20QFw21kRdfE6GUhjHM32hM2BgrZUKX7BydtQmY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16690,7 +30940,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -16698,7 +30948,7 @@ + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true +- GuaranteeNoPartialB: true ++ GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 +@@ -16707,43 +30957,43 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 + LSPA: 2 +- LSPB: 16 ++ LSPB: 64 + LVCA: 128 +- LVCB: 16 ++ LVCB: 4 + LVPA: 1 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 74240 + LdsInitCVgprs: false +- LdsNumBytes: 132096 ++ LdsNumBytes: 74240 + LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 32768 ++ LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66048 ++ LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 +- LdsOffsetB_Blk: 99328 ++ LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 99328 ++ LdsOffsetMetadata: 74240 ++ LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false +@@ -16775,29 +31025,29 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 +- NumLoadsB: 32 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 32 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -16811,20 +31061,20 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 75 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 137 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 2048 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16858,8 +31108,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +@@ -16880,7 +31130,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16889,24 +31139,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x512_MI16xRfloz7-QLYI3UYa6vUS22X8Soil0Yg89MIbpvHz5hag= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x0lJaZrmdU8wT0DLyEO2qL_LIzzTO_k14WTNH4SXKnxg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16919,7 +31170,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false +@@ -16931,7 +31182,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 16 +@@ -16941,24 +31192,24 @@ + LVCB: 4 + LVPA: 1 + LVPB: 16 +- LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107008 ++ LdsBytesNoAmax: 107520 + LdsInitCVgprs: false +- LdsNumBytes: 107008 +- LdsNumElementsAlignedA: 66048 ++ LdsNumBytes: 107520 ++ LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 40960 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66048 +- LdsOffsetB_Blk: 197120 ++ LdsOffsetB: 66560 ++ LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 107008 +- LdsOffsetMetadata_Blk: 197120 ++ LdsOffsetMetadata: 107520 ++ LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 16 + LdsPadMetadata: 0 +@@ -16999,23 +31250,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 +@@ -17029,14 +31280,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 76 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 138 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -17045,10 +31296,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -17073,7 +31324,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -17082,8 +31333,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +@@ -17104,7 +31355,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -17113,24 +31364,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x512_MI16xsi9dF4p65Dz8_g2YvBf4poaFJH7JhMkB8sUCjOcQnEE= ++ BaseName: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16xru4mTIAM1Ap7veXW9YFk-_8664TFpBX_95Vc0Vbp9X0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -17155,7 +31407,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 32 +@@ -17168,11 +31420,11 @@ + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 107008 + LdsInitCVgprs: false +- LdsNumBytes: 98816 ++ LdsNumBytes: 107008 + LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 65536 ++ LdsNumElementsAlignedB: 73728 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +@@ -17181,10 +31433,10 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 98816 ++ LdsOffsetMetadata: 107008 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 +- LdsPadB: 0 ++ LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 +@@ -17223,7 +31475,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -17232,8 +31484,8 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -17259,8 +31511,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 77 +- SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 139 ++ SolutionNameMin: Cijk_Alik_Bjlk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -17272,7 +31524,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -17298,7 +31550,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -17306,8 +31558,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml +index 1a9b577d94..8905ce0c14 100644 +--- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml ++++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Origami/gfx950_Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs.yaml +@@ -1,7 +1,7 @@ + - {MinimumRequiredVersion: 5.0.0} + - gfx950 + - gfx950 +-- [Device 0049, Device 0050] ++- [Device 75a0] + - Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false +@@ -235,7 +235,7 @@ + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Custom_Cijk_Alik_Bljk_S_MX_B_BIAS_HA_S_SAV_NTD_SK3_UserArgs_MT256x256x32_MI16x16x1_shortname0_gfx950 +- SourceSwap: false ++ SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 +@@ -265,7 +265,6 @@ + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 +- UseDot2F32XEmulation: true + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 +@@ -301,9 +300,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -312,31 +311,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x256x16_MI32Hz-tNi2vW1nZqIYXkVLPiovQC2JnikedCteeXKffILk= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16Mblhr8Bq6azeHsu3Q1FGGsY5yqeoKlhdtU76OZMbqPQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -354,36 +354,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 64 +- LVCA: 16 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 100352 ++ LdsBytesNoAmax: 70656 + LdsInitCVgprs: false +- LdsNumBytes: 100352 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 70656 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 82944 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 165888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 82944 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 70656 ++ LdsOffsetMetadata_Blk: 165888 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -392,10 +392,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -403,26 +403,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 +- MIWaveTileB: 4 ++ MIWaveTile: [8, 7] ++ MIWaveTileA: 8 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 256 ++ MacroTile1: 224 + MacroTileA: 256 +- MacroTileB: 256 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -431,20 +431,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 256 +- NumGlobalWriteVectorsPerThread: 64 +- NumLoadsA: 16 +- NumLoadsB: 4 ++ NumElementsPerThread: 224 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 8 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -452,18 +452,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 0 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x256x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 1 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -471,17 +471,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 4 +- ThreadTileA: 64 +- ThreadTileB: 4 ++ ThreadTile0: 32 ++ ThreadTile1: 7 ++ ThreadTileA: 32 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -497,22 +497,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -525,9 +525,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -536,24 +536,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x192x16_MI32BGyzXBVQc5knU7WuSv2zecCpe9-GDKVHnUPRiobEgXE= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16sx-DCFh2Lc-0cpsKRwtUwsp-irEsbtOimGff9bXE1lM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -578,36 +579,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x192x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 64 +- LVCA: 4 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 128 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 31232 ++ LdsBytesNoAmax: 128000 + LdsInitCVgprs: false +- LdsNumBytes: 31232 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 13824 ++ LdsNumBytes: 128000 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 50176 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 31232 +- LdsOffsetMetadata_Blk: 50176 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -616,10 +617,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -627,9 +628,9 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 3] +- MIWaveTileA: 4 +- MIWaveTileB: 3 ++ MIWaveTile: [8, 6] ++ MIWaveTileA: 8 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 +@@ -640,13 +641,13 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -655,20 +656,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 4 +- NumLoadsB: 3 ++ NumLoadsA: 8 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 3 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -677,17 +678,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 1 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x192x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 2 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -695,17 +696,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 3 +- ThreadTileA: 64 +- ThreadTileB: 3 ++ ThreadTile0: 32 ++ ThreadTile1: 6 ++ ThreadTileA: 32 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -721,22 +722,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -749,9 +750,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -760,24 +761,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x16_MI32jzbNFMG50LwwHgfH3JytrCYq5K76JDDwfxqBWVFfC7w= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16CE1g6SzR9XNsp1CIaI2DLSdNlKoaRTfZIliJrwPrjMQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -802,36 +804,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 64 +- LVCA: 4 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 26624 ++ LdsBytesNoAmax: 125952 + LdsInitCVgprs: false +- LdsNumBytes: 26624 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 125952 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 50176 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 26624 +- LdsOffsetMetadata_Blk: 50176 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -840,10 +842,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -851,26 +853,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [8, 5] ++ MIWaveTileA: 8 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 128 ++ MacroTile1: 160 + MacroTileA: 256 +- MacroTileB: 128 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -879,20 +881,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 2 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 160 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 8 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -901,17 +903,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 2 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 3 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -919,17 +921,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 32 ++ ThreadTile1: 5 ++ ThreadTileA: 32 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -945,22 +947,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -973,9 +975,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -984,7 +986,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x16_MI32x8TiWhLBm4AzO8i4h6zsRgnyo-hdHtfGXKkVnNjaieg8= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16pm458AaGX7JxGtUr8nqil6Z913Tf2zLmGBqIRg3MP0M= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -994,14 +996,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1014,7 +1017,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -1026,36 +1029,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 64 +- LVCA: 4 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 128 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 55808 ++ LdsBytesNoAmax: 52224 + LdsInitCVgprs: false +- LdsNumBytes: 55808 +- LdsNumElementsAlignedA: 18432 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 52224 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 18432 +- LdsOffsetB_Blk: 51200 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 18432 +- LdsOffsetMetadata_Blk: 51200 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 52224 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -1064,37 +1067,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [8, 4] ++ MIWaveTileA: 8 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 +- MacroTile1: 64 ++ MacroTile1: 128 + MacroTileA: 256 +- MacroTileB: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -1103,20 +1106,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 ++ NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 1 ++ NumLoadsA: 8 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1130,20 +1133,20 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 3 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 4 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 +@@ -1151,9 +1154,9 @@ + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 32 +- ThreadTile1: 2 ++ ThreadTile1: 4 + ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1168,23 +1171,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1199,7 +1202,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1208,24 +1211,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x16_MI329TPdsK9mW1Hy3s2qtZveZz7Zw9fpanelLX34kbbYKzA= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16xLLGCKGBBRnkwounRSPIppOmeqcZBKCYgl_Yk6H23wjk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1238,7 +1242,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -1250,36 +1254,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 64 +- LVCA: 4 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 31232 ++ LdsBytesNoAmax: 50176 + LdsInitCVgprs: false +- LdsNumBytes: 31232 +- LdsNumElementsAlignedA: 13824 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 13824 +- LdsOffsetB_Blk: 46592 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 31232 +- LdsOffsetMetadata_Blk: 46592 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -1288,10 +1292,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1299,26 +1303,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 +- MIWaveTileB: 4 ++ MIWaveTile: [8, 3] ++ MIWaveTileA: 8 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 256 +- MacroTileA: 192 +- MacroTileB: 256 ++ MacroTile0: 256 ++ MacroTile1: 96 ++ MacroTileA: 256 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -1327,20 +1331,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 192 +- NumLoadsA: 3 +- NumLoadsB: 4 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 8 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 3 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1348,36 +1352,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 4 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 5 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 4 +- ThreadTileA: 48 +- ThreadTileB: 4 ++ ThreadTile0: 32 ++ ThreadTile1: 3 ++ ThreadTileA: 32 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1392,23 +1396,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1421,9 +1425,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1432,37 +1436,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x16_MI32YfR9kog35jdrcZRh6NOYAR70b6ySHPU8eHwR6TX1f-Y= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16xo6JGVHcUDUiUKjJMmlIvC6oagI3kJDkqYaLEnmwsYcM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -1474,48 +1479,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 16 +- LVCA: 16 +- LVCB: 16 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58880 ++ LdsBytesNoAmax: 44032 + LdsInitCVgprs: false +- LdsNumBytes: 58880 +- LdsNumElementsAlignedA: 13056 +- LdsNumElementsAlignedB: 13056 ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 13056 +- LdsOffsetB_Blk: 45824 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 13056 +- LdsOffsetMetadata_Blk: 45824 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1523,27 +1528,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [8, 2] ++ MIWaveTileA: 8 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 192 +- MacroTileA: 192 +- MacroTileB: 192 ++ MacroTile0: 256 ++ MacroTile1: 64 ++ MacroTileA: 256 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1551,20 +1556,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 12 +- NumLoadsB: 12 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1573,35 +1578,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 5 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 6 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ ThreadTile0: 32 ++ ThreadTile1: 2 ++ ThreadTileA: 32 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1616,23 +1621,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1645,9 +1650,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1656,7 +1661,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x16_MI32Z3OGQQr-e_YOT_8YNpE_3O7ieorQ0Q1V6MoCKfgo_B0= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16xZqNAo4xmsRdmTdCvveRRKYkeryYAR5H1NGDmkOl61Ro= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -1666,14 +1671,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -1686,7 +1692,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -1698,48 +1704,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 64 +- LVCA: 4 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53568 ++ LdsBytesNoAmax: 39936 + LdsInitCVgprs: false +- LdsNumBytes: 53568 +- LdsNumElementsAlignedA: 12480 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 39936 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 12480 +- LdsOffsetB_Blk: 45248 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 12480 +- LdsOffsetMetadata_Blk: 45248 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 39936 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1747,27 +1753,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [8, 1] ++ MIWaveTileA: 8 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 256 ++ MacroTile1: 32 ++ MacroTileA: 256 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1775,20 +1781,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 3 +- NumLoadsB: 2 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 8 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 3 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -1802,30 +1808,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 6 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 7 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 32 ++ ThreadTile1: 1 ++ ThreadTileA: 32 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -1840,23 +1846,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -1869,9 +1875,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -1880,31 +1886,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x16_MI32xKfebZgPFK1_3UZwphre1GnwHzxV3tKrS4avOT_WHBCY= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16zTOfwjsDs24zKh42muTJeilgVA7QlFCR3degxqxUAxQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -1922,48 +1929,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 64 +- LVCA: 16 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49984 ++ LdsBytesNoAmax: 70656 + LdsInitCVgprs: false +- LdsNumBytes: 49984 +- LdsNumElementsAlignedA: 13056 +- LdsNumElementsAlignedB: 4160 ++ LdsNumBytes: 70656 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 13056 +- LdsOffsetB_Blk: 45824 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 13056 +- LdsOffsetMetadata_Blk: 45824 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 70656 ++ LdsOffsetMetadata_Blk: 166912 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -1971,27 +1978,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 +- MIWaveTileB: 1 ++ MIWaveTile: [7, 8] ++ MIWaveTileA: 7 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 224 ++ MacroTile1: 256 ++ MacroTileA: 224 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -1999,20 +2006,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 12 +- NumLoadsB: 1 ++ NumElementsPerThread: 224 ++ NumGlobalWriteVectorsPerThread: 224 ++ NumLoadsA: 7 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 1 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2021,17 +2028,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 7 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 8 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -2039,17 +2046,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 1 +- ThreadTileA: 48 +- ThreadTileB: 1 ++ ThreadTile0: 28 ++ ThreadTile1: 8 ++ ThreadTileA: 28 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2065,22 +2072,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2095,7 +2102,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2104,24 +2111,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x16_MI327Rr8yHzZrFtDmlDMpOmTJRkCFdagoWwyBnw6ERCbMTo= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16lgH3A6Rz-Uiuma6FOpuYHC-FAzl1QstNFe5dSRmuqio= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -2134,7 +2142,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -2146,36 +2154,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 64 +- LVCA: 4 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 27136 ++ LdsBytesNoAmax: 71680 + LdsInitCVgprs: false +- LdsNumBytes: 27136 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 18432 ++ LdsNumBytes: 71680 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 41472 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 166912 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 27136 +- LdsOffsetMetadata_Blk: 41472 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 71680 ++ LdsOffsetMetadata_Blk: 166912 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -2184,37 +2192,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 7] ++ MIWaveTileA: 7 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 224 ++ MacroTile1: 224 ++ MacroTileA: 224 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2223,20 +2231,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 2 +- NumLoadsB: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 196 ++ NumGlobalWriteVectorsPerThread: 196 ++ NumLoadsA: 7 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2245,35 +2253,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 8 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 9 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 7 ++ ThreadTileA: 28 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2288,8 +2296,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -2297,14 +2305,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2319,7 +2327,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2328,7 +2336,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x16_MI32_hsF6JSat5agy4Xt7O-5mgkoZ0tt1XC8yF5V85aoL9Q= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16Z59Tx3bjXsZE2NIiMnGLdDPlIeDGrK12p0tmfVG6Bj8= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -2338,14 +2346,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -2358,7 +2367,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -2370,48 +2379,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 64 +- LVCA: 4 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 53568 ++ LdsBytesNoAmax: 129024 + LdsInitCVgprs: false +- LdsNumBytes: 53568 +- LdsNumElementsAlignedA: 8320 +- LdsNumElementsAlignedB: 12480 ++ LdsNumBytes: 129024 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8320 +- LdsOffsetB_Blk: 41088 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8320 +- LdsOffsetMetadata_Blk: 41088 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2419,27 +2428,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 3] +- MIWaveTileA: 2 +- MIWaveTileB: 3 ++ MIWaveTile: [7, 6] ++ MIWaveTileA: 7 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 128 ++ MacroTile0: 224 + MacroTile1: 192 +- MacroTileA: 128 ++ MacroTileA: 224 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -2447,20 +2456,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 2 +- NumLoadsB: 3 ++ NumElementsPerThread: 168 ++ NumGlobalWriteVectorsPerThread: 168 ++ NumLoadsA: 7 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 3 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2474,30 +2483,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 9 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 10 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 3 +- ThreadTileA: 32 +- ThreadTileB: 3 ++ ThreadTile0: 28 ++ ThreadTile1: 6 ++ ThreadTileA: 28 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2512,23 +2521,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2541,9 +2550,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2552,37 +2561,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x16_MI32_WlXLrevBbl_0U1Dd9DwU6-BBD6mTzCXou8jOHaiU90= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI161eph_PBuNv3Wu857Wb91TShdlt_umWfBnQcjeEWPJ-k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -2594,36 +2604,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 16 +- LVCA: 16 +- LVCB: 16 +- LVPA: 16 +- LVPB: 16 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 18432 ++ LdsBytesNoAmax: 126976 + LdsInitCVgprs: false +- LdsNumBytes: 18432 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 126976 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 41984 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 18432 +- LdsOffsetMetadata_Blk: 41984 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 35840 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -2632,10 +2642,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2643,26 +2653,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveTile: [7, 5] ++ MIWaveTileA: 7 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 224 ++ MacroTile1: 160 ++ MacroTileA: 224 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2671,20 +2681,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerThread: 140 ++ NumGlobalWriteVectorsPerThread: 140 ++ NumLoadsA: 7 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -2693,35 +2703,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 10 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 11 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 5 ++ ThreadTileA: 28 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2736,23 +2746,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2765,9 +2775,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -2776,7 +2786,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x16_MI32xbvS-9Ni9P1ovLBbjRth2p9Qnn5GwAJ4aHr0mRb6xOsE= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI169TlTh60JiZVySmWyo9rnD2pgmNa4d5u6CACo8AfrHBo= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -2786,27 +2796,28 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -2818,36 +2829,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 16 +- LVCA: 16 +- LVCB: 16 +- LVPA: 16 +- LVPB: 16 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 + LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 30208 ++ LdsBytesNoAmax: 53248 + LdsInitCVgprs: false +- LdsNumBytes: 30208 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 53248 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 25600 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 9216 +- LdsOffsetMetadata_Blk: 25600 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 53248 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -2856,10 +2867,10 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -2867,26 +2878,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 1] +- MIWaveTileA: 2 +- MIWaveTileB: 1 ++ MIWaveTile: [7, 4] ++ MIWaveTileA: 7 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 224 ++ MacroTile1: 128 ++ MacroTileA: 224 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -2895,19 +2906,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 112 ++ NumLoadsA: 7 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -2916,36 +2927,36 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 11 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 12 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 1 +- ThreadTileA: 32 +- ThreadTileB: 1 ++ ThreadTile0: 28 ++ ThreadTile1: 4 ++ ThreadTileA: 28 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -2960,23 +2971,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -2991,7 +3002,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3000,37 +3011,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x16_MI32xZzE6-99anuiPYMyevl-3MnsI9c3Im_AFIHhYRpwHTY8= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x6_Sh2VgGaxbR4OiKhmUsbfU61tyE1fMZ_EXcUnXd29E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -3042,36 +3054,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 64 +- LVCA: 16 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 23040 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 23040 +- LdsNumElementsAlignedA: 4608 +- LdsNumElementsAlignedB: 18432 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 37376 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 23040 +- LdsOffsetMetadata_Blk: 37376 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3080,37 +3092,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 3] ++ MIWaveTileA: 7 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 256 +- MacroTileA: 64 +- MacroTileB: 256 ++ MacroTile0: 224 ++ MacroTile1: 96 ++ MacroTileA: 224 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -3119,20 +3131,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 4 ++ NumElementsPerThread: 84 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 7 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3141,35 +3153,35 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 12 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 13 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 3 ++ ThreadTileA: 28 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3184,8 +3196,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -3193,14 +3205,14 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3213,9 +3225,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3224,7 +3236,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x16_MI32xmqUo5yJmHg7xO67aSP37JKZuxkcOQxi77G8yClkmoMs= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16xXCdnc34ZMi0vBhXuiTZzTAiZtMej3mRNXCjoU-hyyvo= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -3234,22 +3246,23 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -3266,48 +3279,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 16 +- LVCA: 16 +- LVCB: 16 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 50176 ++ LdsBytesNoAmax: 45056 + LdsInitCVgprs: false +- LdsNumBytes: 50176 +- LdsNumElementsAlignedA: 4352 +- LdsNumElementsAlignedB: 13056 ++ LdsNumBytes: 45056 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 4352 +- LdsOffsetB_Blk: 37120 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4352 +- LdsOffsetMetadata_Blk: 37120 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 45056 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3315,27 +3328,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] +- MIWaveTileA: 1 +- MIWaveTileB: 3 ++ MIWaveTile: [7, 2] ++ MIWaveTileA: 7 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 224 ++ MacroTile1: 64 ++ MacroTileA: 224 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -3343,20 +3356,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 4 +- NumLoadsB: 12 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 7 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3370,12 +3383,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 13 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 14 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -3383,17 +3396,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 28 ++ ThreadTile1: 2 ++ ThreadTileA: 28 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3409,22 +3422,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3437,9 +3450,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3448,31 +3461,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x16_MI32xVHGHoo6kBhnJCZLaYJh4xujyzh3BWf6YirqUvbEcCPU= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16xUPP4CKzrEFHoshksrpbTYYrFWaKuRPinrqORNwm-xxg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -3490,48 +3504,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 16 +- LSPB: 64 +- LVCA: 16 +- LVCB: 4 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 29056 ++ LdsBytesNoAmax: 40960 + LdsInitCVgprs: false +- LdsNumBytes: 29056 +- LdsNumElementsAlignedA: 4352 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 40960 ++ LdsNumElementsAlignedA: 35840 ++ LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4352 +- LdsOffsetB_Blk: 20736 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 35840 ++ LdsOffsetB_Blk: 101376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4352 +- LdsOffsetMetadata_Blk: 20736 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 40960 ++ LdsOffsetMetadata_Blk: 101376 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3539,27 +3553,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 2] +- MIWaveTileA: 1 +- MIWaveTileB: 2 ++ MIWaveTile: [7, 1] ++ MIWaveTileA: 7 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 224 ++ MacroTile1: 32 ++ MacroTileA: 224 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -3567,20 +3581,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 2 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 7 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3589,17 +3603,17 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 14 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 15 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -3607,17 +3621,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 28 ++ ThreadTile1: 1 ++ ThreadTileA: 28 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3633,22 +3647,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3663,7 +3677,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3672,7 +3686,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x16_MI32x3V8Tr8YiE7PaUGoQRjNFpPedbvJE1-ae3IDvQpAWS5lY= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16igSC_EOhmyuevZzbDh6BQ5_4VYGRysf20mBm5H1OSm8= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -3682,14 +3696,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -3697,12 +3712,12 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -3714,48 +3729,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 +- LSPA: 64 +- LSPB: 16 +- LVCA: 4 +- LVCB: 16 +- LVPA: 16 +- LVPB: 16 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 256 ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 24896 ++ LdsBytesNoAmax: 128000 + LdsInitCVgprs: false +- LdsNumBytes: 24896 +- LdsNumElementsAlignedA: 4160 +- LdsNumElementsAlignedB: 4352 ++ LdsNumBytes: 128000 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4160 +- LdsOffsetB_Blk: 20544 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4160 +- LdsOffsetMetadata_Blk: 20544 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 27648 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -3763,48 +3778,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTile: [6, 8] ++ MIWaveTileA: 6 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 192 ++ MacroTile1: 256 ++ MacroTileA: 192 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 1 +- NumLoadsB: 4 ++ NumElementsPerThread: 192 ++ NumGlobalWriteVectorsPerThread: 96 ++ NumLoadsA: 6 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 1 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -3818,30 +3833,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 15 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 16 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 8 ++ ThreadTileA: 24 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -3856,23 +3871,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -3887,7 +3902,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -3896,24 +3911,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x16_MI32x3lFi9PF-LKug7MvK_R38DtWQHEF6R780j5hasUZdvGgI= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16wHWW2wKBFvXKERu7iCfoxfdy9piyLP9SZmfPb1qrRVA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -3926,7 +3942,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -3938,36 +3954,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 ++ LSCA: 32 ++ LSCB: 32 + LSPA: 32 + LSPB: 32 +- LVCA: 4 +- LVCB: 4 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 15104 ++ LdsBytesNoAmax: 129024 + LdsInitCVgprs: false +- LdsNumBytes: 15104 +- LdsNumElementsAlignedA: 2304 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 129024 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 8192 +- LdsOffsetB: 2304 +- LdsOffsetB_Blk: 10496 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 2304 +- LdsOffsetMetadata_Blk: 10496 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 27648 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -3976,96 +3992,96 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 7] ++ MIWaveTileA: 6 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 32 +- MacroTile1: 64 +- MacroTileA: 32 +- MacroTileB: 64 ++ MacroTile0: 192 ++ MacroTile1: 224 ++ MacroTileA: 192 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 1 +- NumLoadsB: 2 ++ NumElementsPerThread: 168 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 6 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 1 +- NumLoadsPerpendicularB: 2 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 16 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x16_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 17 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 7 ++ ThreadTileA: 24 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4080,23 +4096,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -4109,9 +4125,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4120,7 +4136,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x16_MI32x3GBHxfGA51SvpuF0KMzTl7G0QkCLqEpC0DTaIjpDRV24= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16WukQsBniNYEJPx7xFWf5O07eBveoE9QRboZ9VaCgd0o= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -4130,14 +4146,15 @@ + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 16 +- DirectToLds: 0 ++ DepthU: 32 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4150,7 +4167,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -4162,36 +4179,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 16 +- LSCB: 16 ++ LSCA: 32 ++ LSCB: 32 + LSPA: 32 + LSPB: 32 +- LVCA: 4 +- LVCB: 4 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 6912 ++ LdsBytesNoAmax: 120832 + LdsInitCVgprs: false +- LdsNumBytes: 6912 +- LdsNumElementsAlignedA: 4608 +- LdsNumElementsAlignedB: 2304 ++ LdsNumBytes: 120832 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 8192 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 12800 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 6912 +- LdsOffsetMetadata_Blk: 12800 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 27648 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -4200,37 +4217,37 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 +- LoopUnroll: 16 ++ LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 6] ++ MIWaveTileA: 6 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 192 ++ MacroTile1: 192 ++ MacroTileA: 192 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -4239,57 +4256,57 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 2 +- NumLoadsB: 1 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 6 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 1 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 17 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x16_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS64_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 18 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 64 ++ StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 24 ++ ThreadTile1: 6 ++ ThreadTileA: 24 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4304,23 +4321,23 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 16 +- _DepthUA: 16 +- _DepthUB: 16 +- _DepthUMetadata: 16 ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -4333,9 +4350,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4344,7 +4361,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x192x32_MI16EYlk3XIK8EWXodhq3aV7O8-X32W_-JoaQwAWID52YXw= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16i-afMB5Wg-wzMPV4lZ3l2zcL7LpVpTTBFCc1OChgq-w= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -4355,13 +4372,14 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4374,7 +4392,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -4386,7 +4404,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -4396,24 +4414,24 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 128000 ++ LdsBytesNoAmax: 53248 + LdsInitCVgprs: false +- LdsNumBytes: 128000 +- LdsNumElementsAlignedA: 34816 +- LdsNumElementsAlignedB: 27648 ++ LdsNumBytes: 53248 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 34816 +- LdsOffsetB_Blk: 100352 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 34816 +- LdsOffsetMetadata_Blk: 100352 ++ LdsOffsetMetadata: 53248 ++ LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -4435,14 +4453,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [8, 6] +- MIWaveTileA: 8 +- MIWaveTileB: 6 ++ MIWaveTile: [6, 5] ++ MIWaveTileA: 6 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 192 +- MacroTileA: 256 +- MacroTileB: 192 ++ MacroTile0: 192 ++ MacroTile1: 160 ++ MacroTileA: 192 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -4454,7 +4472,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -4463,20 +4481,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 6 ++ NumElementsPerThread: 120 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 6 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 6 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -4490,8 +4508,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 20 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 19 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4500,20 +4518,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 6 +- ThreadTileA: 32 +- ThreadTileB: 6 ++ ThreadTile0: 24 ++ ThreadTile1: 5 ++ ThreadTileA: 24 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4528,8 +4546,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -4537,8 +4555,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -4559,7 +4577,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4568,24 +4586,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x32_MI324wtZXJOma7WqdyYl50R-7ZK-zumozLf-bvUQmYFeS6w= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16JuDvgzIyFuBuPWlKTOYift8BRap_sULNHiQeaWLa6GM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4598,7 +4617,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -4610,7 +4629,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -4620,26 +4639,26 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 51200 ++ LdsBytesNoAmax: 45056 + LdsInitCVgprs: false +- LdsNumBytes: 51200 +- LdsNumElementsAlignedA: 33792 ++ LdsNumBytes: 45056 ++ LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 99328 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 51200 +- LdsOffsetMetadata_Blk: 99328 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 45056 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -4647,11 +4666,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -4659,26 +4678,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [6, 4] ++ MIWaveTileA: 6 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 256 ++ MacroTile0: 192 + MacroTile1: 128 +- MacroTileA: 256 ++ MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -4687,19 +4706,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -4709,13 +4728,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 21 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 20 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4724,20 +4743,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 24 ++ ThreadTile1: 4 ++ ThreadTileA: 24 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4752,17 +4771,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -4781,9 +4800,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -4792,24 +4811,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x32_MI32xh1JWsTG3UlQ5UUZL3zASFbTPAuOTsIm_06I3nOTl7ZA= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16xtc8DntSukj8vsyLgn2hkAWj5uB7uOY1WmEkcVrbyruc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -4834,7 +4854,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -4844,66 +4864,66 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107136 ++ LdsBytesNoAmax: 43008 + LdsInitCVgprs: false +- LdsNumBytes: 107136 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 98816 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 3] ++ MIWaveTileA: 6 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 64 +- MacroTileA: 256 +- MacroTileB: 64 ++ MacroTile0: 192 ++ MacroTile1: 96 ++ MacroTileA: 192 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -4911,20 +4931,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 2 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 6 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -4933,13 +4953,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 22 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 21 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -4951,17 +4971,17 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 24 ++ ThreadTile1: 3 ++ ThreadTileA: 24 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -4977,16 +4997,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5005,9 +5025,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5016,24 +5036,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32sIYQBRMQTu2cNGVkFJ2d6Z21s8bopjK0ST4NqGoYOu4= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16x7OspsTfPDQIdQksds3lUMP8NTU9zA0603OZ0JO7T7AA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5046,7 +5067,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -5058,7 +5079,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -5068,38 +5089,38 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 123776 ++ LdsBytesNoAmax: 36864 + LdsInitCVgprs: false +- LdsNumBytes: 123776 +- LdsNumElementsAlignedA: 24960 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 36864 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 24960 +- LdsOffsetB_Blk: 90496 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 24960 +- LdsOffsetMetadata_Blk: 90496 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 36864 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5107,27 +5128,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 +- MIWaveTileB: 4 ++ MIWaveTile: [6, 2] ++ MIWaveTileA: 6 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 +- MacroTile1: 256 ++ MacroTile1: 64 + MacroTileA: 192 +- MacroTileB: 256 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -5135,20 +5156,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 192 +- NumGlobalWriteVectorsPerThread: 192 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 6 +- NumLoadsB: 8 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5157,13 +5178,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 23 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 22 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5172,20 +5193,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 4 +- ThreadTileA: 48 +- ThreadTileB: 4 ++ ThreadTile0: 24 ++ ThreadTile1: 2 ++ ThreadTileA: 24 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5200,17 +5221,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5231,7 +5252,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5240,24 +5261,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x32_MI32ZQn8HhYTItqIu_H-IdzY973uHhvp5AroTVvguoMGRCo= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16xv2Ltx4sWSsTOQXvxoTuwAZ1kjwYR_FY0x7uZ7R0slhc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5270,7 +5292,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -5282,7 +5304,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -5292,26 +5314,26 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 55296 ++ LdsBytesNoAmax: 32768 + LdsInitCVgprs: false +- LdsNumBytes: 55296 ++ LdsNumBytes: 32768 + LdsNumElementsAlignedA: 27648 +- LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 ++ LdsOffsetA_Blk: 32768 + LdsOffsetB: 27648 +- LdsOffsetB_Blk: 93184 ++ LdsOffsetB_Blk: 60416 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 55296 +- LdsOffsetMetadata_Blk: 93184 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 60416 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -5319,11 +5341,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5331,26 +5353,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [6, 1] ++ MIWaveTileA: 6 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 192 +- MacroTile1: 192 ++ MacroTile1: 32 + MacroTileA: 192 +- MacroTileB: 192 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -5359,20 +5381,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 6 +- NumLoadsB: 6 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 +- NumLoadsPerpendicularB: 6 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5381,13 +5403,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 24 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 23 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5396,20 +5418,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ ThreadTile0: 24 ++ ThreadTile1: 1 ++ ThreadTileA: 24 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5424,17 +5446,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5455,7 +5477,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5464,31 +5486,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x32_MI32K9Hlf4MV7tbsnueQ5i6ghOjfjaPgd12shTFnEt-DzFE= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16gig7lHaQwi1Ma9bePZP0eHGuvTr1GOvKpK4y0pjkFxI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -5506,48 +5529,48 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +- LSPA: 8 ++ LSPA: 32 + LSPB: 32 +- LVCA: 32 ++ LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 108288 ++ LdsBytesNoAmax: 125952 + LdsInitCVgprs: false +- LdsNumBytes: 108288 +- LdsNumElementsAlignedA: 26112 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 125952 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 26112 +- LdsOffsetB_Blk: 91648 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 26112 +- LdsOffsetMetadata_Blk: 91648 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5555,27 +5578,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [5, 8] ++ MIWaveTileA: 5 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 160 ++ MacroTile1: 256 ++ MacroTileA: 160 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -5583,20 +5606,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 24 +- NumLoadsB: 4 ++ NumElementsPerThread: 160 ++ NumGlobalWriteVectorsPerThread: 160 ++ NumLoadsA: 5 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 24 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5605,13 +5628,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 25 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 24 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5623,17 +5646,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 20 ++ ThreadTile1: 8 ++ ThreadTileA: 20 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5649,16 +5672,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5677,9 +5700,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 1 ++ - 1LDSBuffer: 0 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5688,24 +5711,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x32_MI32xXCG0J0tKNv8mO5VYXImXIQNh4C5k0bjrV4GZExVtFc8= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16XHarJQzv5VjKJsS-aArjh9WILj7mNfIquP3unX6K00o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5730,7 +5754,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -5743,23 +5767,23 @@ + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 36864 ++ LdsBytesNoAmax: 126976 + LdsInitCVgprs: false +- LdsNumBytes: 36864 +- LdsNumElementsAlignedA: 27648 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 126976 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 27648 +- LdsOffsetB_Blk: 93184 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 36864 +- LdsOffsetMetadata_Blk: 93184 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 25600 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -5767,11 +5791,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -5779,26 +5803,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 +- MIWaveTileB: 1 ++ MIWaveTile: [5, 7] ++ MIWaveTileA: 5 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 160 ++ MacroTile1: 224 ++ MacroTileA: 160 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -5807,20 +5831,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 6 +- NumLoadsB: 2 ++ NumElementsPerThread: 140 ++ NumGlobalWriteVectorsPerThread: 140 ++ NumLoadsA: 5 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 6 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -5829,13 +5853,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 26 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 25 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x224x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -5847,17 +5871,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 1 +- ThreadTileA: 48 +- ThreadTileB: 1 ++ ThreadTile0: 20 ++ ThreadTile1: 7 ++ ThreadTileA: 20 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -5879,10 +5903,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -5903,7 +5927,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -5912,24 +5936,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32YSPlokb_vFYskbarypNn6uankuWE0IUvabRJ4eyv2Zs= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16MM6yXrHkKNevAvGOCUWm6AIYUGyuR0Gorw-MvHcpIrI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -5942,7 +5967,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -5954,7 +5979,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -5964,26 +5989,26 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 51712 ++ LdsBytesNoAmax: 53248 + LdsInitCVgprs: false +- LdsNumBytes: 51712 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 34816 ++ LdsNumBytes: 53248 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 51712 +- LdsOffsetMetadata_Blk: 82432 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 53248 ++ LdsOffsetMetadata_Blk: 91136 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -5991,38 +6016,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 2 ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 6] ++ MIWaveTileA: 5 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 160 ++ MacroTile1: 192 ++ MacroTileA: 160 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -6031,20 +6056,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 4 +- NumLoadsB: 8 ++ NumElementsPerThread: 120 ++ NumGlobalWriteVectorsPerThread: 120 ++ NumLoadsA: 5 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6053,13 +6078,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 27 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 26 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6068,20 +6093,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 20 ++ ThreadTile1: 6 ++ ThreadTileA: 20 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6096,7 +6121,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -6105,8 +6130,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6125,9 +6150,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6136,7 +6161,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x32_MI16d9LrbNNi5b3TXEDFla_nBJIIQr4eZMjULrCgtdUB9gU= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16HgxGrym_mMtiR0mv-GJCl4sJI_qyDdeH-_dmA4ihuFg= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -6147,13 +6172,14 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -6166,7 +6192,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -6178,7 +6204,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -6188,24 +6214,24 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107776 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 107776 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 25344 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82432 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -6213,8 +6239,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -6226,15 +6252,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [8, 3] +- MIWaveTileA: 8 +- MIWaveTileB: 3 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 5] ++ MIWaveTileA: 5 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 192 +- MacroTileA: 128 +- MacroTileB: 192 ++ MacroTile0: 160 ++ MacroTile1: 160 ++ MacroTileA: 160 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -6246,8 +6272,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6255,20 +6281,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 24 +- NumLoadsA: 4 +- NumLoadsB: 6 ++ NumElementsPerThread: 100 ++ NumGlobalWriteVectorsPerThread: 100 ++ NumLoadsA: 5 ++ NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 6 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6276,14 +6302,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 28 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 27 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6292,20 +6318,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 3 +- ThreadTileA: 32 +- ThreadTileB: 3 ++ ThreadTile0: 20 ++ ThreadTile1: 5 ++ ThreadTileA: 20 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6320,17 +6346,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6349,9 +6375,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6360,24 +6386,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16ac1MeF93CRji3aw9JRkhlETVYNbVM3aM2yceh-P8FCM= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16B8mpan0CCYfQ8Dp-s81y1Tqx53HM44CCuAoqQOFi9XU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -6390,7 +6417,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -6402,7 +6429,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -6412,24 +6439,24 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 43008 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82432 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -6437,8 +6464,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -6451,13 +6478,13 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 128 ++ MacroTile0: 160 + MacroTile1: 128 +- MacroTileA: 128 ++ MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 +@@ -6470,8 +6497,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6479,19 +6506,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 5 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -6500,14 +6527,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 +- PrefetchLocalRead: 1 ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 29 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 28 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6516,19 +6543,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 20 + ThreadTile1: 4 +- ThreadTileA: 16 ++ ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -6544,7 +6571,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -6553,8 +6580,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6573,9 +6600,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6584,7 +6611,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x1VFeE9PS51mSX3gf7E4sVQrWjf5Zqm-e4RDCGzFtx74= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16xHAmI-YI0vdXe4AZ0Ugc541yWh0Tqtnok4mBECNr_YnA= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -6595,26 +6622,27 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -6626,34 +6654,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 59392 ++ LdsBytesNoAmax: 40960 + LdsInitCVgprs: false +- LdsNumBytes: 59392 +- LdsNumElementsAlignedA: 17408 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 40960 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 17408 +- LdsOffsetB_Blk: 50176 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 50176 ++ LdsOffsetMetadata: 40960 ++ LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -6675,14 +6703,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [5, 3] ++ MIWaveTileA: 5 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 160 ++ MacroTile1: 96 ++ MacroTileA: 160 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -6694,7 +6722,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -6703,20 +6731,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 5 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6724,14 +6752,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 30 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 29 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6740,20 +6768,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 20 ++ ThreadTile1: 3 ++ ThreadTileA: 20 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6768,8 +6796,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -6777,8 +6805,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -6797,9 +6825,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -6808,7 +6836,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16xLuvRzQQ4l9YfTrQPTiczIa8N2UU3Bw9A3r9q0UaSeKo= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16xzCr4Q2iJY3sf01yFVY4zsbm9JJlBV3Qk5HaaPwvuKmo= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -6819,26 +6847,27 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -6850,34 +6879,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +- LSPA: 8 ++ LSPA: 32 + LSPB: 32 +- LVCA: 32 ++ LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 108032 ++ LdsBytesNoAmax: 34816 + LdsInitCVgprs: false +- LdsNumBytes: 108032 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 34816 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 74240 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 91136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 +- LdsOffsetMetadata_Blk: 74240 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -6885,8 +6914,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -6898,15 +6927,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 +- MIWaveTileB: 4 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 2] ++ MIWaveTileA: 5 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 256 +- MacroTileA: 64 +- MacroTileB: 256 ++ MacroTile0: 160 ++ MacroTile1: 64 ++ MacroTileA: 160 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -6918,8 +6947,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -6927,20 +6956,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 5 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -6948,14 +6977,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 31 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 30 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -6964,20 +6993,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 4 +- ThreadTileA: 16 +- ThreadTileB: 4 ++ ThreadTile0: 20 ++ ThreadTile1: 2 ++ ThreadTileA: 20 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -6992,17 +7021,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 4 ++ VectorWidthA: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7023,7 +7052,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7032,7 +7061,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x32_MI16xJ19uAh2j9-0EkF6-eCu5fBFywOghcfgktiCzGMfySRk= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16xkOJdz6SrHvk7WEdrywZ8qqKxOAMxc4U_8ChrjhF39bA= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -7043,26 +7072,27 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -7074,34 +7104,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +- LSPA: 8 ++ LSPA: 32 + LSPB: 32 +- LVCA: 32 ++ LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 39424 ++ LdsBytesNoAmax: 30720 + LdsInitCVgprs: false +- LdsNumBytes: 39424 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 30720 ++ LdsNumBytes: 30720 ++ LdsNumElementsAlignedA: 25600 ++ LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 74240 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 25600 ++ LdsOffsetB_Blk: 58368 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 39424 +- LdsOffsetMetadata_Blk: 74240 ++ LdsOffsetMetadata: 30720 ++ LdsOffsetMetadata_Blk: 58368 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -7122,15 +7152,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 3] +- MIWaveTileA: 4 +- MIWaveTileB: 3 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 1] ++ MIWaveTileA: 5 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 160 ++ MacroTile1: 32 ++ MacroTileA: 160 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -7142,7 +7172,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7151,20 +7181,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 12 +- NumLoadsA: 8 +- NumLoadsB: 6 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 5 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 6 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7178,8 +7208,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 32 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 31 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -7188,20 +7218,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 20 ++ ThreadTile1: 1 ++ ThreadTileA: 20 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7216,17 +7246,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7245,9 +7275,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7256,7 +7286,7 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x-HtbfPN73qceBqBgeLq6fMIJZ_7IIcoTdNzOwvlf-M0= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16go9x1cCSMS5Zs2gNIqb6lZSaO9pQGj7tbR0M5r8KR90= + BufferLoad: true + BufferStore: true + CUCount: null +@@ -7267,26 +7297,27 @@ + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -7298,34 +7329,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 59392 ++ LdsBytesNoAmax: 52224 + LdsInitCVgprs: false +- LdsNumBytes: 59392 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 17408 ++ LdsNumBytes: 52224 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 41984 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 9216 +- LdsOffsetMetadata_Blk: 41984 ++ LdsOffsetMetadata: 52224 ++ LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -7347,14 +7378,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [2, 4] +- MIWaveTileA: 2 +- MIWaveTileB: 4 ++ MIWaveTile: [4, 8] ++ MIWaveTileA: 4 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 128 ++ MacroTile1: 256 ++ MacroTileA: 128 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -7366,7 +7397,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -7375,20 +7406,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 16 ++ NumElementsPerThread: 128 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7396,14 +7427,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 33 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 32 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -7412,20 +7443,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 8 +- ThreadTile1: 4 +- ThreadTileA: 8 +- ThreadTileB: 4 ++ ThreadTile0: 16 ++ ThreadTile1: 8 ++ ThreadTileA: 16 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7440,7 +7471,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -7449,8 +7480,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 +@@ -7469,9 +7500,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7480,24 +7511,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI32x37N3p2O4fzN73nd8_YpHddH8Ji48qWqNVaRze5bqiDBg= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16qSDdfr1rwM0oGsO7fq0qGc9ElsgiRG2Ct6QEwmDkiEI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -7510,7 +7542,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -7522,7 +7554,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 +@@ -7532,38 +7564,38 @@ + LVCB: 8 + LVPA: 8 + LVPB: 8 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 49408 ++ LdsBytesNoAmax: 53248 + LdsInitCVgprs: false +- LdsNumBytes: 49408 +- LdsNumElementsAlignedA: 8320 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 53248 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8320 +- LdsOffsetB_Blk: 41088 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8320 +- LdsOffsetMetadata_Blk: 41088 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 53248 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -7571,27 +7603,27 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTile: [4, 7] ++ MIWaveTileA: 4 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 128 ++ MacroTile1: 224 ++ MacroTileA: 128 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -7599,20 +7631,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 2 +- NumLoadsB: 2 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 4 ++ NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 7 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -7621,13 +7653,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 34 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 33 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -7636,20 +7668,14195 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 7 ++ ThreadTileA: 16 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16JLNytsHX4XSdQAAO4xJ5BHS48VZSaP5pzcgUWAJQR4w= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 45056 ++ LdsInitCVgprs: false ++ LdsNumBytes: 45056 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 45056 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 6] ++ MIWaveTileA: 4 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 192 ++ MacroTileA: 128 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 4 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 34 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 6 ++ ThreadTileA: 16 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16spGoIUhzs6cQe9O6kqyfSS09uRi-hGWGGJ8mzOOY0zQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 43008 ++ LdsInitCVgprs: false ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 160 ++ MacroTileA: 128 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 4 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 35 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 5 ++ ThreadTileA: 16 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16y2KM6CSH11qkK0wbEiiL_3_Z4bTs2AxAlRRVAylN_ns= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 34816 ++ LdsInitCVgprs: false ++ LdsNumBytes: 34816 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 4 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 36 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 4 ++ ThreadTileA: 16 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16xcDalFRjrFf9LD1mVL7ZY-vwFRKMxVGBnn10ukgOUvQo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 32768 ++ LdsInitCVgprs: false ++ LdsNumBytes: 32768 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 50176 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 96 ++ MacroTileA: 128 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 4 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 37 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16xUiBVJAzn4XJGPy7LLOcb2tQshty-zPdLQ7wCiXXS_7k= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 26624 ++ LdsInitCVgprs: false ++ LdsNumBytes: 26624 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 26624 ++ LdsOffsetMetadata_Blk: 50176 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 2] ++ MIWaveTileA: 4 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 64 ++ MacroTileA: 128 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 38 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 2 ++ ThreadTileA: 16 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16xIUOlXMKFasxHO056sn72xCneh9LJFflW0FftqB5Lv7Q= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 22528 ++ LdsInitCVgprs: false ++ LdsNumBytes: 22528 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 22528 ++ LdsOffsetMetadata_Blk: 50176 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 32 ++ MacroTileA: 128 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 4 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 39 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16xk3fHPXJSfgHTOIuSvPM-2B2ynR5FE97-ani7tJlO60Q= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 50176 ++ LdsInitCVgprs: false ++ LdsNumBytes: 50176 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 50176 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 8] ++ MIWaveTileA: 3 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 256 ++ MacroTileA: 96 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 96 ++ NumGlobalWriteVectorsPerThread: 96 ++ NumLoadsA: 3 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 40 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 8 ++ ThreadTileA: 12 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16xTAcUSOPZZjoDIO38RttldDv4ll6XhTCk5bxFaonEhUU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 51200 ++ LdsInitCVgprs: false ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 7] ++ MIWaveTileA: 3 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 224 ++ MacroTileA: 96 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 84 ++ NumGlobalWriteVectorsPerThread: 84 ++ NumLoadsA: 3 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 41 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 7 ++ ThreadTileA: 12 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16xgNrG9lnRYTPl6nWeHr2U-ng5fhfJ5ORDgnd6p5PVo0E= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 43008 ++ LdsInitCVgprs: false ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 6] ++ MIWaveTileA: 3 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 192 ++ MacroTileA: 96 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 3 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 42 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 6 ++ ThreadTileA: 12 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16xjtzqONY7XYEtwp-aZmn9h-QHvQ9GtjwXm5zDYGpxxAE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 40960 ++ LdsInitCVgprs: false ++ LdsNumBytes: 40960 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 80896 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 40960 ++ LdsOffsetMetadata_Blk: 80896 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 5] ++ MIWaveTileA: 3 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 160 ++ MacroTileA: 96 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 3 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 43 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 5 ++ ThreadTileA: 12 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16x00LL_02B8CrCps3LiLiUOlZn3GVNi6qk0q_fOjTjj78= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 32768 ++ LdsInitCVgprs: false ++ LdsNumBytes: 32768 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 128 ++ MacroTileA: 96 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 3 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 44 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x1QM_q7_EijTkJkAAQMbO6UUtrSCcF6ikTtOYFcqOJuOg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30720 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30720 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 30720 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 3] ++ MIWaveTileA: 3 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 96 ++ MacroTileA: 96 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 36 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 3 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 45 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 3 ++ ThreadTileA: 12 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x1mSp4vfu3FMRgqDs-XpMTVSAIyCwYHAksvDhTk9U_VFk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 24576 ++ LdsInitCVgprs: false ++ LdsNumBytes: 24576 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 24576 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 2] ++ MIWaveTileA: 3 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 64 ++ MacroTileA: 96 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 3 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 46 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 2 ++ ThreadTileA: 12 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x1m_EsIPWhISlgppSVDfRFYFEIb7GQ5U7cE2-XWRmRtys= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 20480 ++ LdsInitCVgprs: false ++ LdsNumBytes: 20480 ++ LdsNumElementsAlignedA: 15360 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 15360 ++ LdsOffsetB_Blk: 48128 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 20480 ++ LdsOffsetMetadata_Blk: 48128 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 3 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 47 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 1 ++ ThreadTileA: 12 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16xRs19VY6H1Jb2SNx-bXza7u1las4YbNtWYPaWDlWl0fw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 44032 ++ LdsInitCVgprs: false ++ LdsNumBytes: 44032 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 44032 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 8] ++ MIWaveTileA: 2 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 256 ++ MacroTileA: 64 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 48 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 8 ++ ThreadTileA: 8 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16xnwitZsH2VfFX0m_wPVQkS8DIoDpJmR7JJYiW_bPVkBM= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 45056 ++ LdsInitCVgprs: false ++ LdsNumBytes: 45056 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 45056 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 7] ++ MIWaveTileA: 2 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 224 ++ MacroTileA: 64 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 2 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 49 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 7 ++ ThreadTileA: 8 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16xrV1rClUhjCTx9w2WB3ew3QS204CcXACq782ndeA3Sik= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 36864 ++ LdsInitCVgprs: false ++ LdsNumBytes: 36864 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 36864 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 6] ++ MIWaveTileA: 2 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 192 ++ MacroTileA: 64 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 2 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 50 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 6 ++ ThreadTileA: 8 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16xlNxModXAn2BNDdLCB-wLODckJxm2hzLX9BpKkgmfBbg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 34816 ++ LdsInitCVgprs: false ++ LdsNumBytes: 34816 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 5] ++ MIWaveTileA: 2 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 160 ++ MacroTileA: 64 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 2 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 51 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 5 ++ ThreadTileA: 8 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16xcxRU7X2FVgEGTrOOR0Jpruh7BkkfFCqtp8c3rKfRzbg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 26624 ++ LdsInitCVgprs: false ++ LdsNumBytes: 26624 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 26624 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 4] ++ MIWaveTileA: 2 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 128 ++ MacroTileA: 64 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 2 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 52 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 4 ++ ThreadTileA: 8 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x1vVIw0sKSDTh2e2Ldys8PXxhrIC6w430o9QuKD2j7fU0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 24576 ++ LdsInitCVgprs: false ++ LdsNumBytes: 24576 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 24576 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 3] ++ MIWaveTileA: 2 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 96 ++ MacroTileA: 64 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 2 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 53 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 3 ++ ThreadTileA: 8 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x1LqhtmBhkr4XLV3HQ3R8ANhJaSJUBBrE5_0Y1Xv_38WM= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 18432 ++ LdsInitCVgprs: false ++ LdsNumBytes: 18432 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 18432 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 2] ++ MIWaveTileA: 2 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 64 ++ MacroTileA: 64 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 2 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 54 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 2 ++ ThreadTileA: 8 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x1llwLmoUr4ZBbnBp4eADuX3gWfXYYCz7s3vdoIohLgkg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30720 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30720 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 25600 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 9216 ++ LdsOffsetMetadata_Blk: 25600 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 64 ++ MacroTile1: 32 ++ MacroTileA: 64 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 2 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 55 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 8 ++ ThreadTile1: 1 ++ ThreadTileA: 8 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16x9BY6kqNx_mVd1gFwO79Gdw8CeCiQtkAARSuQ9FnwI9E= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 39936 ++ LdsInitCVgprs: false ++ LdsNumBytes: 39936 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 70656 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 39936 ++ LdsOffsetMetadata_Blk: 70656 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 8] ++ MIWaveTileA: 1 ++ MIWaveTileB: 8 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 256 ++ MacroTileA: 32 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 1 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 56 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 8 ++ ThreadTileA: 4 ++ ThreadTileB: 8 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16xN-SC-7y2_vynHRPIAso7-wX09OsuZvS8LKA1v5EmctI= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 40960 ++ LdsInitCVgprs: false ++ LdsNumBytes: 40960 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 35840 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 70656 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 40960 ++ LdsOffsetMetadata_Blk: 70656 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 7] ++ MIWaveTileA: 1 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 224 ++ MacroTileA: 32 ++ MacroTileB: 224 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 1 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 57 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 7 ++ ThreadTileA: 4 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16xyUqG7eW7LdoTgxmyHlo81ITemCgb9AF8nWEJbpMxNG0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 32768 ++ LdsInitCVgprs: false ++ LdsNumBytes: 32768 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 32768 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 6] ++ MIWaveTileA: 1 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 192 ++ MacroTileA: 32 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 1 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 58 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 6 ++ ThreadTileA: 4 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16xGGyITDQ352WVn8esAhU-ZG9qfxzBiAM7flCouwxyEMc= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30720 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30720 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 25600 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 30720 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 5] ++ MIWaveTileA: 1 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 160 ++ MacroTileA: 32 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 1 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 59 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 5 ++ ThreadTileA: 4 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16xLIU0sWQ_vp-4jywpSmX7sFbsNfvOqs89ez-aJLeSYZE= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 22528 ++ LdsInitCVgprs: false ++ LdsNumBytes: 22528 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 22528 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 128 ++ MacroTileA: 32 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 60 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x1UMd59OKc_sUDj0wnWSnBmUTdgft2_xROBn6PGxO_sCI= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 20480 ++ LdsInitCVgprs: false ++ LdsNumBytes: 20480 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 15360 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 37888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 20480 ++ LdsOffsetMetadata_Blk: 37888 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] ++ MIWaveTileA: 1 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 96 ++ MacroTileA: 32 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 1 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 61 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x1UiZ3BcWrWVltQ6U-PiIVyq_-nKLMZ_3O8tVFCzYcqUw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 30720 ++ LdsInitCVgprs: false ++ LdsNumBytes: 30720 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 5120 ++ LdsOffsetMetadata_Blk: 21504 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 2] ++ MIWaveTileA: 1 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 64 ++ MacroTileA: 32 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 1 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 62 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x1aXBe76A1LqPYZsl5W42jQOqGA1tnTDWH6MsRatZeI5M= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 32 ++ LVCA: 8 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 26624 ++ LdsInitCVgprs: false ++ LdsNumBytes: 26624 ++ LdsNumElementsAlignedA: 5120 ++ LdsNumElementsAlignedB: 5120 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 16384 ++ LdsOffsetB: 5120 ++ LdsOffsetB_Blk: 21504 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 5120 ++ LdsOffsetMetadata_Blk: 21504 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 1] ++ MIWaveTileA: 1 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 1 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 63 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 1 ++ ThreadTileA: 4 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16VeY2KyOM6I7D09WdyUfkcJ2A3LqHJF7xeixlIPB6chs= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 73216 ++ LdsInitCVgprs: false ++ LdsNumBytes: 73216 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 38400 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 165888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 73216 ++ LdsOffsetMetadata_Blk: 165888 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 15] ++ MIWaveTileA: 4 ++ MIWaveTileB: 15 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 240 ++ MacroTileA: 256 ++ MacroTileB: 240 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 240 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 8 ++ NumLoadsB: 15 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 15 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 64 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x240x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_15_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 15 ++ ThreadTileA: 16 ++ ThreadTileB: 15 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16ntmiLDTkYUHMME-V40JwKHWVJae_xom5TcGdd6utVUI= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 68096 ++ LdsInitCVgprs: false ++ LdsNumBytes: 68096 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 33280 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 165888 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 68096 ++ LdsOffsetMetadata_Blk: 165888 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 13] ++ MIWaveTileA: 4 ++ MIWaveTileB: 13 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 208 ++ MacroTileA: 256 ++ MacroTileB: 208 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 208 ++ NumGlobalWriteVectorsPerThread: 52 ++ NumLoadsA: 8 ++ NumLoadsB: 13 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 13 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 65 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x208x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_13_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 13 ++ ThreadTileA: 16 ++ ThreadTileB: 13 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI162bF-MxfZiFg9kAS_19mm6vU_MbTjhqcozEOeMu9J0JA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 128512 ++ LdsInitCVgprs: false ++ LdsNumBytes: 128512 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 28160 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 11] ++ MIWaveTileA: 4 ++ MIWaveTileB: 11 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 176 ++ MacroTileA: 256 ++ MacroTileB: 176 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 176 ++ NumGlobalWriteVectorsPerThread: 44 ++ NumLoadsA: 8 ++ NumLoadsB: 11 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 11 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 66 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x176x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_11_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 11 ++ ThreadTileA: 16 ++ ThreadTileB: 11 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16MfclCQMsl9QDPOaUF0kNntalSg0l5SL5g7jNYvWS3gQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 123392 ++ LdsInitCVgprs: false ++ LdsNumBytes: 123392 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 23040 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 9] ++ MIWaveTileA: 4 ++ MIWaveTileB: 9 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 144 ++ MacroTileA: 256 ++ MacroTileB: 144 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 8 ++ NumLoadsB: 9 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 9 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 67 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x144x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 9 ++ ThreadTileA: 16 ++ ThreadTileB: 9 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16s2V_LPOoaCmhck7yUvRAtngZOQzqv5XY9638HWrfiM4= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 52736 ++ LdsInitCVgprs: false ++ LdsNumBytes: 52736 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 17920 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 52736 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 7] ++ MIWaveTileA: 4 ++ MIWaveTileB: 7 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 112 ++ MacroTileA: 256 ++ MacroTileB: 112 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 8 ++ NumLoadsB: 7 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 7 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 68 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 7 ++ ThreadTileA: 16 ++ ThreadTileB: 7 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16xdmy-Pv3z--9tLov6Mh7ItqFdf0ysrkNoOmmd3L72vok= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 47616 ++ LdsInitCVgprs: false ++ LdsNumBytes: 47616 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 12800 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 47616 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 80 ++ MacroTileA: 256 ++ MacroTileB: 80 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 8 ++ NumLoadsB: 5 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 5 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 69 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x80x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 5 ++ ThreadTileA: 16 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16xh5Qnw-fCkMlp5m-LoUYOr6w58PFaF5hhtcVv7BxBzPQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 42496 ++ LdsInitCVgprs: false ++ LdsNumBytes: 42496 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 7680 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 42496 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 48 ++ MacroTileA: 256 ++ MacroTileB: 48 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 8 ++ NumLoadsB: 3 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 3 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 70 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16xcjJrxVmACWz7wQmr1OJ2XXHzWZGUwStUiMEQdZMLa24= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 2 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 32 ++ LSPB: 16 ++ LVCA: 8 ++ LVCB: 16 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 37376 ++ LdsInitCVgprs: false ++ LdsNumBytes: 37376 ++ LdsNumElementsAlignedA: 34816 ++ LdsNumElementsAlignedB: 2560 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 37376 ++ LdsOffsetMetadata_Blk: 100352 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 16 ++ MacroTileA: 256 ++ MacroTileB: 16 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 1 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 1 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 71 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [64, 4, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16hN4XrV88x4XvjenBWKo6spqe6AYtPIF4I3GhbZ21LWY= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 73216 ++ LdsInitCVgprs: false ++ LdsNumBytes: 73216 ++ LdsNumElementsAlignedA: 38400 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 38400 ++ LdsOffsetB_Blk: 169472 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 73216 ++ LdsOffsetMetadata_Blk: 169472 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [15, 4] ++ MIWaveTileA: 15 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 240 ++ MacroTile1: 256 ++ MacroTileA: 240 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 240 ++ NumGlobalWriteVectorsPerThread: 240 ++ NumLoadsA: 15 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 15 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 72 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT240x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT15_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 60 ++ ThreadTile1: 4 ++ ThreadTileA: 60 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI166mLbzp-opmiZo5Ayr3ObW7hS2LyJsa6P5gDooT2AkNo= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 68096 ++ LdsInitCVgprs: false ++ LdsNumBytes: 68096 ++ LdsNumElementsAlignedA: 33280 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33280 ++ LdsOffsetB_Blk: 164352 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 68096 ++ LdsOffsetMetadata_Blk: 164352 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [13, 4] ++ MIWaveTileA: 13 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 208 ++ MacroTile1: 256 ++ MacroTileA: 208 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 208 ++ NumGlobalWriteVectorsPerThread: 208 ++ NumLoadsA: 13 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 13 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 73 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT208x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT13_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 52 ++ ThreadTile1: 4 ++ ThreadTileA: 52 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16ecEYhxoP4bNZbfLkRRpDuXp7K_4y6Rfkhn0marzz_Fg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 128512 ++ LdsInitCVgprs: false ++ LdsNumBytes: 128512 ++ LdsNumElementsAlignedA: 28160 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 28160 ++ LdsOffsetB_Blk: 93696 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 28160 ++ LdsOffsetMetadata_Blk: 93696 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [11, 4] ++ MIWaveTileA: 11 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 176 ++ MacroTile1: 256 ++ MacroTileA: 176 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 176 ++ NumGlobalWriteVectorsPerThread: 176 ++ NumLoadsA: 11 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 11 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 74 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT176x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 44 ++ ThreadTile1: 4 ++ ThreadTileA: 44 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 0 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16AHdf7LFDV-fECuTEjK7cMae6CDCgEKj-dewOiwEoeOQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 123392 ++ LdsInitCVgprs: false ++ LdsNumBytes: 123392 ++ LdsNumElementsAlignedA: 23040 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 23040 ++ LdsOffsetB_Blk: 88576 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 23040 ++ LdsOffsetMetadata_Blk: 88576 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [9, 4] ++ MIWaveTileA: 9 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 144 ++ MacroTile1: 256 ++ MacroTileA: 144 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 144 ++ NumGlobalWriteVectorsPerThread: 144 ++ NumLoadsA: 9 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 9 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 75 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 36 ++ ThreadTile1: 4 ++ ThreadTileA: 36 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16x3NjwYtDqtiqy15hv3CbwR3Zxg-bUv6UFn621mvF7Bg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 52736 ++ LdsInitCVgprs: false ++ LdsNumBytes: 52736 ++ LdsNumElementsAlignedA: 17920 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17920 ++ LdsOffsetB_Blk: 83456 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 52736 ++ LdsOffsetMetadata_Blk: 83456 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [7, 4] ++ MIWaveTileA: 7 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 112 ++ MacroTile1: 256 ++ MacroTileA: 112 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 112 ++ NumGlobalWriteVectorsPerThread: 112 ++ NumLoadsA: 7 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 7 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 76 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT112x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 4 ++ ThreadTileA: 28 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16xVfG4yKGZhRUhpuUGW12r6bNldhnUUj_saqhqvEv_RXk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 47616 ++ LdsInitCVgprs: false ++ LdsNumBytes: 47616 ++ LdsNumElementsAlignedA: 12800 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 12800 ++ LdsOffsetB_Blk: 78336 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 47616 ++ LdsOffsetMetadata_Blk: 78336 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 80 ++ MacroTile1: 256 ++ MacroTileA: 80 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 5 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 5 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 77 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 4 ++ ThreadTileA: 20 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x9oxiqx5PReupUBd8k-OQeMjVk0fsa4UX8fgFoE8ZPtU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 42496 ++ LdsInitCVgprs: false ++ LdsNumBytes: 42496 ++ LdsNumElementsAlignedA: 7680 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 7680 ++ LdsOffsetB_Blk: 73216 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 42496 ++ LdsOffsetMetadata_Blk: 73216 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 48 ++ MacroTile1: 256 ++ MacroTileA: 48 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 3 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 78 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16xmOi_kuGXpQ6o0qlcOunHB9j4yZTUmcs5QtrajeVVHDA= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 0 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 32 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 2 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ LDSTrInst: false ++ LSCA: 32 ++ LSCB: 32 ++ LSPA: 16 ++ LSPB: 32 ++ LVCA: 16 ++ LVCB: 8 ++ LVPA: 8 ++ LVPB: 8 ++ LdsBlockSizePerPadA: 128 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 37376 ++ LdsInitCVgprs: false ++ LdsNumBytes: 37376 ++ LdsNumElementsAlignedA: 2560 ++ LdsNumElementsAlignedB: 34816 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 2560 ++ LdsOffsetB_Blk: 68096 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 37376 ++ LdsOffsetMetadata_Blk: 68096 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 1 ++ LoopUnroll: 32 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 16 ++ MacroTile1: 256 ++ MacroTileA: 16 ++ MacroTileB: 256 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 1 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 0 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 79 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 128 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 4 ++ SubGroup1: 64 ++ SubGroupA: 4 ++ SubGroupB: 64 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [16, 16, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 32 ++ _DepthUA: 32 ++ _DepthUB: 32 ++ _DepthUMetadata: 32 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x7Z_ZSdTdNPNspjJ86wNSQIrH7a8t3aCYDd-i4kd7yOI= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 76800 ++ LdsInitCVgprs: false ++ LdsNumBytes: 76800 ++ LdsNumElementsAlignedA: 67584 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 76800 ++ LdsOffsetMetadata_Blk: 198656 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [8, 1] ++ MIWaveTileA: 8 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 256 ++ MacroTile1: 32 ++ MacroTileA: 256 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 16 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 16 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 80 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 32 ++ ThreadTile1: 1 ++ ThreadTileA: 32 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16xLDnq_xObwEpdCAXqPgQoR-GHHzMJfWwIRhL0EwnkLuQ= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 81920 ++ LdsInitCVgprs: false ++ LdsNumBytes: 81920 ++ LdsNumElementsAlignedA: 64512 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 64512 ++ LdsOffsetB_Blk: 195584 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 81920 ++ LdsOffsetMetadata_Blk: 195584 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 2] ++ MIWaveTileA: 7 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 224 ++ MacroTile1: 64 ++ MacroTileA: 224 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 56 ++ NumLoadsA: 14 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 14 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 81 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 2 ++ ThreadTileA: 28 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16xHT4eBlJWlmPEGANjQz9ili_zDQPgSnCqtjnplpJ2Ntk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 73728 ++ LdsInitCVgprs: false ++ LdsNumBytes: 73728 ++ LdsNumElementsAlignedA: 64512 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 64512 ++ LdsOffsetB_Blk: 195584 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 73728 ++ LdsOffsetMetadata_Blk: 195584 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [7, 1] ++ MIWaveTileA: 7 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 224 ++ MacroTile1: 32 ++ MacroTileA: 224 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 14 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 14 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 82 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 28 ++ ThreadTile1: 1 ++ ThreadTileA: 28 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16xaYegA5F1z4adL3WGKpfR7_sY0BWQNGI_UJMRCvLRzmk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 79872 ++ LdsInitCVgprs: false ++ LdsNumBytes: 79872 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 183296 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 79872 ++ LdsOffsetMetadata_Blk: 183296 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 3] ++ MIWaveTileA: 6 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 96 ++ MacroTileA: 192 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 12 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 83 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 3 ++ ThreadTileA: 24 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16xDl2wXS2xHwVdOVWzgbAvw49O7WeMbHj6nScO1Hrw14s= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 69632 ++ LdsInitCVgprs: false ++ LdsNumBytes: 69632 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 183296 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 69632 ++ LdsOffsetMetadata_Blk: 183296 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 2] ++ MIWaveTileA: 6 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 64 ++ MacroTileA: 192 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 12 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 84 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 2 ++ ThreadTileA: 24 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16xIk0X9lopBBekN51cTVtc_45GOQUVV3p7pDf3zrq53wk= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 2 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 61440 ++ LdsInitCVgprs: false ++ LdsNumBytes: 61440 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 117760 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 61440 ++ LdsOffsetMetadata_Blk: 117760 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [6, 1] ++ MIWaveTileA: 6 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 192 ++ MacroTile1: 32 ++ MacroTileA: 192 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 12 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 85 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT192x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 2 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 24 ++ ThreadTile1: 1 ++ ThreadTileA: 24 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 2 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16YXptMo9ba5D-mJwTZomP1jURSFlJFbfzxYAO9KKNhHM= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 79872 ++ LdsInitCVgprs: false ++ LdsNumBytes: 79872 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 33792 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 177152 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 79872 ++ LdsOffsetMetadata_Blk: 177152 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 4] ++ MIWaveTileA: 5 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 128 ++ MacroTileA: 160 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 80 ++ NumLoadsA: 10 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 86 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 4 ++ ThreadTileA: 20 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16xWyTZ3mbSQgGzjVLx5n7I4jITIP5mWMVwr_wP1cvPKJs= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 73728 ++ LdsInitCVgprs: false ++ LdsNumBytes: 73728 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 177152 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 73728 ++ LdsOffsetMetadata_Blk: 177152 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 3] ++ MIWaveTileA: 5 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 96 ++ MacroTileA: 160 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 10 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 87 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 3 ++ ThreadTileA: 20 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16xXFN8IgGYslLjruiMQtaIsOhpx_-TDqfo-u5MQxpnWOI= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 63488 ++ LdsInitCVgprs: false ++ LdsNumBytes: 63488 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 111616 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 63488 ++ LdsOffsetMetadata_Blk: 111616 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 2] ++ MIWaveTileA: 5 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 64 ++ MacroTileA: 160 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 40 ++ NumLoadsA: 10 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 88 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 2 ++ ThreadTileA: 20 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16xPPxw7fqKz78jlgoSj2rBksF3gMRi8bHqXY8xgtVLR0Q= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 55296 ++ LdsInitCVgprs: false ++ LdsNumBytes: 55296 ++ LdsNumElementsAlignedA: 46080 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 46080 ++ LdsOffsetB_Blk: 111616 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 55296 ++ LdsOffsetMetadata_Blk: 111616 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [5, 1] ++ MIWaveTileA: 5 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 160 ++ MacroTile1: 32 ++ MacroTileA: 160 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 10 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 10 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 89 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT160x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 20 ++ ThreadTile1: 1 ++ ThreadTileA: 20 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16jaI-7xmo7vjhvPFt213tdg1N2KJJ0S5RFGf80PnFMz8= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 79872 ++ LdsInitCVgprs: false ++ LdsNumBytes: 79872 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 46080 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 79872 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 5] ++ MIWaveTileA: 4 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 160 ++ MacroTileA: 128 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 80 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 8 ++ NumLoadsB: 10 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 10 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 90 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 5 ++ ThreadTileA: 16 ++ ThreadTileB: 5 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16XNhVNBLMp3fVSafONlZ2w08ofNDFj4KLw96Je21fUXw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 67584 ++ LdsInitCVgprs: false ++ LdsNumBytes: 67584 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 33792 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 67584 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 4] ++ MIWaveTileA: 4 ++ MIWaveTileB: 4 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 128 ++ MacroTileA: 128 ++ MacroTileB: 128 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 64 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 8 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 91 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 4 ++ ThreadTileA: 16 ++ ThreadTileB: 4 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 4 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16xfqXZ5wkUjj0aXzMl_pMUxdhzOm4UWdAY7qvGIJKYIVU= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 61440 ++ LdsInitCVgprs: false ++ LdsNumBytes: 61440 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 27648 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 61440 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 96 ++ MacroTileA: 128 ++ MacroTileB: 96 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 8 ++ NumLoadsB: 6 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 92 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16xoa3DTDLs6JtyGYjNrlnwdLDsBZqEkDu-ITfLZ7sEgsw= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 51200 ++ LdsInitCVgprs: false ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 17408 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 2] ++ MIWaveTileA: 4 ++ MIWaveTileB: 2 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 64 ++ MacroTileA: 128 ++ MacroTileB: 64 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 8 ++ NumLoadsB: 4 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 93 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 2 ++ ThreadTileA: 16 ++ ThreadTileB: 2 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16xhoCJtNSTNm61PyM0eozZXcORRhvZ0kCl_C_9OyJZOp0= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 4 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 43008 ++ LdsInitCVgprs: false ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 9216 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [4, 1] ++ MIWaveTileA: 4 ++ MIWaveTileB: 1 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 128 ++ MacroTile1: 32 ++ MacroTileA: 128 ++ MacroTileB: 32 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 2 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 2 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 94 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 4 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 16 ++ ThreadTile1: 1 ++ ThreadTileA: 16 ++ ThreadTileB: 1 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 4 ++ VectorWidthB: 1 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16x0vmz7Pfe_Ddtk65fp0l7I6hyi8aCzJFtwvsLz_qNRlg= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 79872 ++ LdsInitCVgprs: false ++ LdsNumBytes: 79872 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 52224 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 158720 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 79872 ++ LdsOffsetMetadata_Blk: 158720 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 6] ++ MIWaveTileA: 3 ++ MIWaveTileB: 6 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 192 ++ MacroTileA: 96 ++ MacroTileB: 192 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 72 ++ NumGlobalWriteVectorsPerThread: 72 ++ NumLoadsA: 6 ++ NumLoadsB: 12 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 12 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 95 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 ++ SuppressNoLoadLoop: false ++ ThreadTile: [1, 1] ++ ThreadTile0: 12 ++ ThreadTile1: 6 ++ ThreadTileA: 12 ++ ThreadTileB: 6 ++ TransposeLDS: 1 ++ TransposeLDSMetadata: true ++ ULSGRODoubleG2L: 0 ++ UnrollLoopSwapGlobalReadOrder: 0 ++ UnrollMajorLDSA: true ++ UnrollMajorLDSB: true ++ UnrollMajorLDSMetadata: true ++ Use64bShadowLimit: 1 ++ UseDotInstruction: false ++ UseF32XEmulation: true ++ UseInstOffsetForGRO: 0 ++ UseSgprForGRO: 0 ++ Valid: true ++ VectorStore: -1 ++ VectorWidthA: 1 ++ VectorWidthB: 2 ++ WaveSeparateGlobalReadA: 0 ++ WaveSeparateGlobalReadB: 0 ++ WaveSeparateGlobalReadMetadata: 0 ++ WaveSplitK: false ++ WavefrontSize: 64 ++ WorkGroup: [32, 8, 1] ++ WorkGroupMapping: 6 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 ++ WorkGroupReduction: false ++ WorkspaceCheck: [4, 0, 0] ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 ++ _GlobalAccumulation: PartialsBuffer ++ _UseSgprForGRO: 0 ++ _VectorStore: 1 ++ _WorkspaceSizePerElemBias: 0 ++ _WorkspaceSizePerElemC: 4 ++ _staggerStrideShift: 0 ++ enableLDSTrA: false ++ enableLDSTrB: false ++ reorderGRInstForDTVA: false ++ reorderGRInstForDTVB: false ++ tailLoopOptA: false ++ tailLoopOptB: false ++ - 1LDSBuffer: 1 ++ ActivationAlt: false ++ ActivationFuncCall: true ++ ActivationFused: true ++ AssertAIGreaterThanEqual: -1 ++ AssertAILessThanEqual: -1 ++ AssertFree0ElementMultiple: 1 ++ AssertFree1ElementMultiple: 1 ++ AssertSummationElementMultiple: 1 ++ AssignedDerivedParameters: true ++ AssignedProblemIndependentDerivedParameters: true ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16xrxP2GQOrIekUuGWkP5E8_SBdZrK8T66yqKZLhK7rG4o= ++ BufferLoad: true ++ BufferStore: true ++ CUCount: null ++ CUOccupancy: -1 ++ ClusterLocalRead: 1 ++ CodeObjectVersion: '4' ++ ConvertAfterDS: false ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 64 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false ++ DirectToVgprA: false ++ DirectToVgprB: false ++ DirectToVgprSparseMetadata: false ++ EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false ++ EnableF32XdlMathOp: true ++ EnableMatrixInstruction: true ++ ExpandPointerSwap: 0 ++ ExpertSchedulingMode: 0 ++ ForceDisableShadowInit: false ++ GlobalReadPerMfma: 1 ++ GlobalReadVectorWidthA: 4 ++ GlobalReadVectorWidthB: 4 ++ GlobalSplitU: 0 ++ GlobalSplitUAlgorithm: MultipleBuffer ++ GlobalSplitUCoalesced: false ++ GlobalSplitUWorkGroupMappingRoundRobin: false ++ GlobalWriteVectorWidth: 1 ++ GroupLoadStore: false ++ GuaranteeNoPartialA: true ++ GuaranteeNoPartialB: true ++ GuaranteeNoPartialMetadata: true ++ ISA: [9, 5, 0] ++ InnerUnroll: 1 ++ InterleaveAlpha: 0 ++ InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, ++ SupportUserGSU: false, UseUniversalArgs: true} ++ Kernel: true ++ KernelLanguage: Assembly ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ LDSTrInst: false ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadMetadata: 0 ++ LdsBytesNoAmax: 73728 ++ LdsInitCVgprs: false ++ LdsNumBytes: 73728 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 46080 ++ LdsNumElementsAlignedMetadata: 0 ++ LdsOffsetA: 0 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 158720 ++ LdsOffsetBias: 0 ++ LdsOffsetBiasGSU: 0 ++ LdsOffsetBiasNonGSU: 0 ++ LdsOffsetMetadata: 73728 ++ LdsOffsetMetadata_Blk: 158720 ++ LdsPadA: 8 ++ LdsPadB: 8 ++ LdsPadMetadata: 0 ++ LocalReadVectorWidth: 4 ++ LocalSplitU: 1 ++ LocalSplitUReuseLDS: 1 ++ LocalWritePerMfma: -1 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 ++ LoopUnroll: 64 ++ MFMA_BF16_1K: false ++ MIArchVgpr: false ++ MIBlock: [16, 16, 32, 1, 1, 1] ++ MIInputPerThread: 8 ++ MIInputPerThreadA: 8 ++ MIInputPerThreadB: 8 ++ MIInputPerThreadMetadata: 8 ++ MIOutputVectorWidth: 4 ++ MIRegPerOut: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 5] ++ MIWaveTileA: 3 ++ MIWaveTileB: 5 ++ MIWaveTileMetadata: 0 ++ MacroTile0: 96 ++ MacroTile1: 160 ++ MacroTileA: 96 ++ MacroTileB: 160 ++ MagicDivAlg: 2 ++ MathClocksUnrolledLoop: 0 ++ MatrixInstB: 1 ++ MatrixInstBM: 1 ++ MatrixInstBN: 1 ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] ++ MaxLDS: 163840 ++ MaxOccupancy: 40 ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false ++ NoReject: false ++ NoTailLoop: false ++ NonDTLTailLoopA: false ++ NonDTLTailLoopB: false ++ NonTemporal: -1 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 ++ NonTemporalE: 0 ++ NonTemporalMetadata: 0 ++ NonTemporalWS: 0 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 60 ++ NumGlobalWriteVectorsPerThread: 60 ++ NumLoadsA: 6 ++ NumLoadsB: 10 ++ NumLoadsCoalescedA: 1 ++ NumLoadsCoalescedB: 1 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 10 ++ NumThreads: 256 ++ NumWaveSplitK: 1 ++ OptNoLoadLoop: 1 ++ PackedC0IdxChars: [I] ++ PackedC0IndicesX: [0] ++ PackedC1IdxChars: [J] ++ PackedC1IndicesX: [1] ++ PrefetchGlobalRead: 2 ++ PrefetchLocalRead: 1 ++ PreloadKernArgs: true ++ ScheduleGlobalRead: 1 ++ ScheduleIterAlg: 3 ++ ScheduleLocalWrite: 1 ++ SolutionIndex: 96 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 ++ SourceSwap: true ++ StaggerU: 0 ++ StaggerUMapping: 0 ++ StaggerUStride: 256 ++ StorePriorityOpt: false ++ StoreRemapVectorWidth: 0 ++ StoreSwapAddr: false ++ StoreSyncOpt: 0 ++ StoreVectorWidth: 1 ++ StreamK: 3 ++ StreamKAtomic: 0 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 12 ++ ThreadTile1: 5 ++ ThreadTileA: 12 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7671,16 +21878,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -7693,9 +21900,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7704,24 +21911,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x32_MI32x3aSQK4KQvEtOMc8KSdilUkLdtSuGwM1F8kvfkUBx9iOM= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16x0qRYdDfIBe7kSFS3TiIXrICFKQvboPz--K-LBY2srMI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -7746,36 +21954,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 32 +- LSCB: 32 ++ LSCA: 64 ++ LSCB: 64 + LSPA: 16 + LSPB: 16 +- LVCA: 8 +- LVCB: 8 ++ LVCA: 16 ++ LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 30208 ++ LdsBytesNoAmax: 61440 + LdsInitCVgprs: false +- LdsNumBytes: 30208 +- LdsNumElementsAlignedA: 4608 +- LdsNumElementsAlignedB: 9216 ++ LdsNumBytes: 61440 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 4608 +- LdsOffsetB_Blk: 20992 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 4608 +- LdsOffsetMetadata_Blk: 20992 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 61440 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -7784,78 +21992,78 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 32 +- MacroTile1: 64 +- MacroTileA: 32 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 128 ++ MacroTileA: 96 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 2 +- NumLoadsB: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 48 ++ NumLoadsA: 6 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 2 +- NumLoadsPerpendicularB: 4 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 35 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 97 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -7863,17 +22071,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -7889,22 +22097,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -7919,7 +22127,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -7928,24 +22136,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x32_MI32x35Yilf3iGK4A4NnBWmUCs1pBmhSllanxvlK8JAzXtJPY= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x1pww0pw34kMqqEnZJd22QbDSavrIxIHQgsIDEab15l8I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 32 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -7970,36 +22179,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 32 +- LSCB: 32 ++ LSCA: 64 ++ LSCB: 64 + LSPA: 16 + LSPB: 16 +- LVCA: 8 +- LVCB: 8 ++ LVCA: 16 ++ LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 128 +- LdsBlockSizePerPadB: 128 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 13824 ++ LdsBytesNoAmax: 55296 + LdsInitCVgprs: false +- LdsNumBytes: 13824 +- LdsNumElementsAlignedA: 9216 +- LdsNumElementsAlignedB: 4608 ++ LdsNumBytes: 55296 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 16384 +- LdsOffsetB: 9216 +- LdsOffsetB_Blk: 25600 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 13824 +- LdsOffsetMetadata_Blk: 25600 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 55296 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8008,78 +22217,78 @@ + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 +- LoopUnroll: 32 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 3] ++ MIWaveTileA: 3 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 96 ++ MacroTile1: 96 ++ MacroTileA: 96 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 +- NumLoadsB: 2 ++ NumElementsPerThread: 36 ++ NumGlobalWriteVectorsPerThread: 36 ++ NumLoadsA: 6 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 2 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 36 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 98 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 128 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -8087,17 +22296,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 12 ++ ThreadTile1: 3 ++ ThreadTileA: 12 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8119,16 +22328,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 32 +- _DepthUA: 32 +- _DepthUB: 32 +- _DepthUMetadata: 32 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -8143,7 +22352,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8152,24 +22361,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x64_MI32yf69Htkud3Ln43dNE6mHWsophcxrnrlaL1jb81cmOqs= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x1rcBz0GRoIUMLF_VejIBtdIjOVeuajTMyHtY9N_6LGNk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8182,7 +22392,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -8194,7 +22404,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -8204,26 +22414,26 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 100352 ++ LdsBytesNoAmax: 45056 + LdsInitCVgprs: false +- LdsNumBytes: 100352 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 45056 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 197632 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 100352 +- LdsOffsetMetadata_Blk: 197632 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 45056 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8231,11 +22441,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8243,26 +22453,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 ++ MIWaveTile: [3, 2] ++ MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 128 +- MacroTileA: 256 +- MacroTileB: 128 ++ MacroTile0: 96 ++ MacroTile1: 64 ++ MacroTileA: 96 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8271,20 +22481,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 6 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8298,8 +22508,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 37 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 99 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8308,19 +22518,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 ++ ThreadTile0: 12 + ThreadTile1: 2 +- ThreadTileA: 64 ++ ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -8336,17 +22546,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8367,7 +22577,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8376,24 +22586,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x64_MI32xGQ4xMbvO_h6cVs0g__pli541keOh-3ThfABsSAWQgtU= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x1ivssFeNll85795_fB6A823Tb8Fya0NMk1gJVo9H27Ss= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8406,7 +22617,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -8418,7 +22629,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -8428,26 +22639,26 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 84480 ++ LdsBytesNoAmax: 36864 + LdsInitCVgprs: false +- LdsNumBytes: 84480 +- LdsNumElementsAlignedA: 67584 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 36864 ++ LdsNumElementsAlignedA: 27648 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 67584 +- LdsOffsetB_Blk: 198656 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 27648 ++ LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 84480 +- LdsOffsetMetadata_Blk: 198656 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 36864 ++ LdsOffsetMetadata_Blk: 93184 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8455,38 +22666,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 256 +- MacroTile1: 64 +- MacroTileA: 256 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8495,20 +22706,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 16 +- NumLoadsB: 4 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 6 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 6 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8522,8 +22733,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 38 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT256x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 100 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8532,20 +22743,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 12 ++ ThreadTile1: 1 ++ ThreadTileA: 12 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8560,17 +22771,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [128, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8591,7 +22802,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8600,24 +22811,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x64_MI32UiGkHwW7heTbRL3k7AZGC7Eq0I2UVZeU9yMddTLPY0Q= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16xxloAMP11SCvA3AK4fftsd0JKKlpJRZfgCCsIh6ev_cQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8630,7 +22842,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -8642,7 +22854,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -8652,26 +22864,26 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 104448 ++ LdsBytesNoAmax: 81920 + LdsInitCVgprs: false +- LdsNumBytes: 104448 +- LdsNumElementsAlignedA: 52224 +- LdsNumElementsAlignedB: 52224 ++ LdsNumBytes: 81920 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 52224 +- LdsOffsetB_Blk: 183296 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 104448 +- LdsOffsetMetadata_Blk: 183296 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 81920 ++ LdsOffsetMetadata_Blk: 148480 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8679,11 +22891,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8691,26 +22903,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 3] +- MIWaveTileA: 3 +- MIWaveTileB: 3 ++ MIWaveTile: [2, 7] ++ MIWaveTileA: 2 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 192 +- MacroTileA: 192 +- MacroTileB: 192 ++ MacroTile0: 64 ++ MacroTile1: 224 ++ MacroTileA: 64 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8718,21 +22930,21 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 144 +- NumGlobalWriteVectorsPerThread: 144 +- NumLoadsA: 12 +- NumLoadsB: 12 ++ NumElementsPerThread: 56 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 4 ++ NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8746,8 +22958,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 39 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 101 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8756,20 +22968,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 3 +- ThreadTileA: 48 +- ThreadTileB: 3 ++ ThreadTile0: 8 ++ ThreadTile1: 7 ++ ThreadTileA: 8 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -8784,17 +22996,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -8815,7 +23027,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -8824,24 +23036,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x64_MI32my91HJmhVCXmPlwcrAtKBpRTnXvfxbQK5xFOD7qb4_k= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16xLlSY6qu384I_OEldvszfzqaAHBVWIwU47E4MAEHLsaY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -8854,7 +23067,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -8866,7 +23079,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -8876,26 +23089,26 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 86016 ++ LdsBytesNoAmax: 69632 + LdsInitCVgprs: false +- LdsNumBytes: 86016 +- LdsNumElementsAlignedA: 52224 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 69632 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 52224 +- LdsOffsetB_Blk: 183296 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 86016 +- LdsOffsetMetadata_Blk: 183296 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 69632 ++ LdsOffsetMetadata_Blk: 148480 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -8903,11 +23116,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -8915,26 +23128,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 2] +- MIWaveTileA: 3 +- MIWaveTileB: 2 ++ MIWaveTile: [2, 6] ++ MIWaveTileA: 2 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 128 +- MacroTileA: 192 +- MacroTileB: 128 ++ MacroTile0: 64 ++ MacroTile1: 192 ++ MacroTileA: 64 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -8943,20 +23156,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 96 +- NumLoadsA: 12 +- NumLoadsB: 8 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 4 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -8970,8 +23183,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 40 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 102 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -8980,20 +23193,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 +- ThreadTile1: 2 +- ThreadTileA: 48 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 6 ++ ThreadTileA: 8 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9008,17 +23221,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9039,7 +23252,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9048,24 +23261,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x64_MI16xDi0TNsnNEaYS7CxxQiss7Aei6AtRfkqebdnhCWHkVIs= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16xN3-w-xlFTtqWw8MhOqCn9Jpi_TCGLUu6CvBDaq8lScI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9078,7 +23292,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -9090,7 +23304,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -9100,24 +23314,24 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 72192 ++ LdsBytesNoAmax: 63488 + LdsInitCVgprs: false +- LdsNumBytes: 72192 +- LdsNumElementsAlignedA: 55296 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 63488 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 55296 +- LdsOffsetB_Blk: 186368 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 72192 +- LdsOffsetMetadata_Blk: 186368 ++ LdsOffsetMetadata: 63488 ++ LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -9138,15 +23352,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [4, 1] +- MIWaveTile: [3, 4] +- MIWaveTileA: 3 +- MIWaveTileB: 4 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 5] ++ MIWaveTileA: 2 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 64 ++ MacroTile1: 160 ++ MacroTileA: 64 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -9158,7 +23372,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9167,20 +23381,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 12 +- NumLoadsB: 4 ++ NumElementsPerThread: 40 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 4 ++ NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 12 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9194,8 +23408,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 41 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 103 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9204,20 +23418,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 16 +- SubGroup1: 16 +- SubGroupA: 16 +- SubGroupB: 16 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 12 +- ThreadTile1: 4 +- ThreadTileA: 12 +- ThreadTileB: 4 ++ ThreadTile0: 8 ++ ThreadTile1: 5 ++ ThreadTileA: 8 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9232,17 +23446,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9263,7 +23477,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9272,24 +23486,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x64_MI32bcFXyGgmndLRG6ahRKsXGowcN-HP6iBlPZOkr-gWXsU= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16xqT-7cHBxNGhk7her4QO0LUsKDXfj4usISRZod6NoCnI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9302,7 +23517,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -9314,7 +23529,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -9324,26 +23539,26 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 512 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 100864 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 100864 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 67584 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 164352 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 100864 +- LdsOffsetMetadata_Blk: 164352 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -9351,38 +23566,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 4] ++ MIWaveTileA: 2 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 256 +- MacroTileA: 128 +- MacroTileB: 256 ++ MacroTile0: 64 ++ MacroTile1: 128 ++ MacroTileA: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9391,20 +23606,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 128 +- NumGlobalWriteVectorsPerThread: 32 +- NumLoadsA: 8 +- NumLoadsB: 16 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9418,8 +23633,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 42 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 104 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9428,20 +23643,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 128 +- SubGroupA: 2 +- SubGroupB: 128 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 64 +- ThreadTile1: 2 +- ThreadTileA: 64 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 4 ++ ThreadTileA: 8 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9456,8 +23671,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -9465,8 +23680,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9487,7 +23702,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9496,24 +23711,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x64_MI32ae-yz_xAR0mdeZubKGtBC2oB5dySwj_2AwSv0oe4iXA= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x1d9jrWlJVq7Bu6FCQuhV_3Q_0zErW0aO7Umat5aewnBA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9538,7 +23754,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -9551,23 +23767,23 @@ + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 86016 ++ LdsBytesNoAmax: 45056 + LdsInitCVgprs: false +- LdsNumBytes: 86016 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 52224 ++ LdsNumBytes: 45056 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 164864 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 86016 +- LdsOffsetMetadata_Blk: 164864 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 45056 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -9575,11 +23791,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -9591,22 +23807,22 @@ + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 192 +- MacroTileA: 128 +- MacroTileB: 192 ++ MacroTile0: 64 ++ MacroTile1: 96 ++ MacroTileA: 64 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -9615,20 +23831,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 96 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 12 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 4 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9642,8 +23858,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 43 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x192x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 105 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -9655,16 +23871,16 @@ + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 ++ ThreadTile0: 8 + ThreadTile1: 3 +- ThreadTileA: 32 ++ ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -9687,10 +23903,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9709,9 +23925,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9720,24 +23936,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI163lhZq5v7F147Hu4DJ6pZtCp2ziHdvFiqR2atYlda_EU= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x1bTqtkrC3zY6775rAVPE-aHA61dWWmgjWDENFFWp1M5Q= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9750,7 +23967,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -9762,7 +23979,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -9772,24 +23989,24 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 135168 ++ LdsBytesNoAmax: 34816 + LdsInitCVgprs: false +- LdsNumBytes: 135168 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 34816 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 67584 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 101376 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33792 +- LdsOffsetMetadata_Blk: 101376 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -9797,8 +24014,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -9811,14 +24028,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 +- MIWaveTileB: 4 ++ MIWaveTile: [2, 2] ++ MIWaveTileA: 2 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 64 ++ MacroTile1: 64 ++ MacroTileA: 64 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -9830,8 +24047,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -9839,20 +24056,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -9866,30 +24083,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 44 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 106 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 4 +- ThreadTileA: 16 +- ThreadTileB: 4 ++ ThreadTile0: 8 ++ ThreadTile1: 2 ++ ThreadTileA: 8 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -9904,8 +24121,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 4 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -9913,8 +24130,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -9933,9 +24150,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -9944,24 +24161,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x9fRi-YFoSH_NDTbp24tqxziMaMOSSI-UaJ682B9qju0= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x1z3_ymyqCAF8cDXDoHRlMZ7aLBxxN1NeevpjXL0HLmSk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -9974,7 +24192,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -9986,7 +24204,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -9996,24 +24214,24 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 116224 ++ LdsBytesNoAmax: 26624 + LdsInitCVgprs: false +- LdsNumBytes: 116224 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 16896 ++ LdsNumBytes: 26624 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 99328 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33792 +- LdsOffsetMetadata_Blk: 99328 ++ LdsOffsetMetadata: 26624 ++ LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -10021,8 +24239,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -10035,14 +24253,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 64 ++ MacroTile1: 32 ++ MacroTileA: 64 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10054,8 +24272,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10063,20 +24281,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 8 +- NumLoadsB: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 4 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10084,14 +24302,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 45 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 107 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10100,20 +24318,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 8 ++ ThreadTile1: 1 ++ ThreadTileA: 8 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10128,8 +24346,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -10137,8 +24355,8 @@ + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10159,7 +24377,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10168,24 +24386,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x64_MI16xPhJznBYJsRqTrgkGXQSZEKGWq5nPhiI8CIGLTIT3MnE= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16xakZaiUCBkGpOuVOYgvqdrSCNMflQ9le0Xh_gr0R3xXc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10198,7 +24417,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -10210,7 +24429,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -10220,24 +24439,24 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 84480 ++ LdsBytesNoAmax: 76800 + LdsInitCVgprs: false +- LdsNumBytes: 84480 +- LdsNumElementsAlignedA: 16896 ++ LdsNumBytes: 76800 ++ LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 147968 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 84480 +- LdsOffsetMetadata_Blk: 147968 ++ LdsOffsetMetadata: 76800 ++ LdsOffsetMetadata_Blk: 140288 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -10258,14 +24477,14 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 4] +- MIWaveTileA: 4 +- MIWaveTileB: 4 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 8] ++ MIWaveTileA: 1 ++ MIWaveTileB: 8 + MIWaveTileMetadata: 0 +- MacroTile0: 64 ++ MacroTile0: 32 + MacroTile1: 256 +- MacroTileA: 64 ++ MacroTileA: 32 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 +@@ -10278,7 +24497,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -10287,19 +24506,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 32 ++ NumGlobalWriteVectorsPerThread: 32 ++ NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -10314,8 +24533,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 46 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 108 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10324,20 +24543,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 4 +- ThreadTileA: 16 +- ThreadTileB: 4 ++ ThreadTile0: 4 ++ ThreadTile1: 8 ++ ThreadTileA: 4 ++ ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10352,17 +24571,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10381,9 +24600,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10392,24 +24611,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x64_MI16x-tNbycfAN_x8dpCP5BMkEpZA-Al8QRiBixFXu-xkJHI= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16xz1oyCgNfLsIjPf_T13Px02mSJLtdhzxWzNYu4lneCN0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10422,7 +24642,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -10434,7 +24654,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -10444,24 +24664,24 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 135168 ++ LdsBytesNoAmax: 73728 + LdsInitCVgprs: false +- LdsNumBytes: 135168 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 50688 ++ LdsNumBytes: 73728 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 64512 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 67584 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 84480 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 140288 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 84480 ++ LdsOffsetMetadata: 73728 ++ LdsOffsetMetadata_Blk: 140288 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -10469,8 +24689,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -10482,15 +24702,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 3] +- MIWaveTileA: 4 +- MIWaveTileB: 3 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 7] ++ MIWaveTileA: 1 ++ MIWaveTileB: 7 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 32 ++ MacroTile1: 224 ++ MacroTileA: 32 ++ MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10502,8 +24722,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10511,20 +24731,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 12 +- NumLoadsA: 4 +- NumLoadsB: 12 ++ NumElementsPerThread: 28 ++ NumGlobalWriteVectorsPerThread: 28 ++ NumLoadsA: 2 ++ NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 12 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10538,30 +24758,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 47 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 109 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x224x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 4 ++ ThreadTile1: 7 ++ ThreadTileA: 4 ++ ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10576,17 +24796,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10605,9 +24825,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10616,24 +24836,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x0GAL1VwNuv2YH6rxiI3jshuGGKsc_2qYfNHC-MduMdM= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16xT3Mp9FUgmaakeOeKIUGvOErL0vKnj7I5_-uP5BFEHqw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10646,7 +24867,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -10658,7 +24879,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -10668,24 +24889,24 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 116224 ++ LdsBytesNoAmax: 61440 + LdsInitCVgprs: false +- LdsNumBytes: 116224 +- LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 61440 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82432 ++ LdsOffsetMetadata: 61440 ++ LdsOffsetMetadata_Blk: 74752 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -10693,8 +24914,8 @@ + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -10706,15 +24927,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 6] ++ MIWaveTileA: 1 ++ MIWaveTileB: 6 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 32 ++ MacroTile1: 192 ++ MacroTileA: 32 ++ MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -10726,8 +24947,8 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -10735,20 +24956,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 4 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 24 ++ NumGlobalWriteVectorsPerThread: 24 ++ NumLoadsA: 2 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10762,8 +24983,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 48 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 110 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x192x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10772,20 +24993,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 6 ++ ThreadTileA: 4 ++ ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -10800,17 +25021,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [16, 16, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -10829,9 +25050,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -10840,24 +25061,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI32x30o_o8f0qKaoA9buF3pUAI58bM3rCQq9YEpjxlejCeo0= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16xTkQ_qaz0JgTCob0uGHSuBYg3A-8fc3Z0NlCQsW50XNk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -10882,7 +25104,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +@@ -10892,38 +25114,38 @@ + LVCB: 16 + LVPA: 4 + LVPB: 4 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 55296 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 16640 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 55296 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16640 +- LdsOffsetB_Blk: 82176 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16640 +- LdsOffsetMetadata_Blk: 82176 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 55296 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -10931,48 +25153,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] ++ MIWaveTile: [1, 5] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 5 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 32 ++ MacroTile1: 160 ++ MacroTileA: 32 ++ MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 +- NumLoadsB: 4 ++ NumElementsPerThread: 20 ++ NumGlobalWriteVectorsPerThread: 20 ++ NumLoadsA: 2 ++ NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -10980,14 +25202,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 49 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 111 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x160x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -10999,17 +25221,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 5 ++ ThreadTileA: 4 ++ ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11031,10 +25253,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11053,9 +25275,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11064,24 +25286,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x64_MI32x3P4385JRZHmKQb8cdOmhovYdfrBfaTlB1fGEqe42TWMs= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16xNz2JX8JdINk5doLTctEtJELIq5EjS9XVZXU_OwkLUKY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11106,98 +25329,98 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +- LSPA: 8 +- LSPB: 8 ++ LSPA: 16 ++ LSPB: 16 + LVCA: 16 + LVCB: 16 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 1024 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 57728 ++ LdsBytesNoAmax: 43008 + LdsInitCVgprs: false +- LdsNumBytes: 57728 +- LdsNumElementsAlignedA: 8320 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 43008 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8320 +- LdsOffsetB_Blk: 41088 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8320 +- LdsOffsetMetadata_Blk: 41088 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 43008 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 4] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 64 ++ MacroTile1: 128 + MacroTileA: 32 +- MacroTileB: 64 ++ MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 4 ++ NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -11210,8 +25433,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 50 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 112 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11223,17 +25446,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11249,16 +25472,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11277,9 +25500,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11288,24 +25511,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI32x3ToUewk0GrfhuDKg1tvfQCVlwFEL2xKE029YxBD5Yt8k= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x19dZNMw7k3cLiwW7KDJBveQ_GFf6OF6Fi2DWe4d52jVM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11330,76 +25554,76 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +- LSPA: 8 +- LSPB: 8 ++ LSPA: 16 ++ LSPB: 16 + LVCA: 16 + LVCB: 16 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 57728 ++ LdsBytesNoAmax: 36864 + LdsInitCVgprs: false +- LdsNumBytes: 57728 +- LdsNumElementsAlignedA: 16640 +- LdsNumElementsAlignedB: 8320 ++ LdsNumBytes: 36864 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16640 +- LdsOffsetB_Blk: 49408 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16640 +- LdsOffsetMetadata_Blk: 49408 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 36864 ++ LdsOffsetMetadata_Blk: 74752 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 4 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 32 +- MacroTileA: 64 +- MacroTileB: 32 ++ MacroTile0: 32 ++ MacroTile1: 96 ++ MacroTileA: 32 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -11407,21 +25631,21 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 4 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 2 ++ NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 4 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 6 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -11434,8 +25658,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 51 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 113 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11447,17 +25671,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11479,10 +25703,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11503,7 +25727,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11512,31 +25736,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x64_MI32x3CoUqzgZrTrMYcsdW-WQHfdNRzXwpuyTRTLBBohyfIjI= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x17JHtgnRCBei9p6Gpo4ufJvGR5HRmVm7tKCjze6gP18U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: 0 ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -11554,97 +25779,97 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 +- LSPA: 4 ++ LSPA: 16 + LSPB: 16 +- LVCA: 64 ++ LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 +- LdsBlockSizePerPadB: 256 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 17408 ++ LdsBytesNoAmax: 26624 + LdsInitCVgprs: false +- LdsNumBytes: 17408 +- LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 8704 ++ LdsNumBytes: 26624 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +- LdsOffsetB: 8704 +- LdsOffsetB_Blk: 41472 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 17408 +- LdsOffsetMetadata_Blk: 41472 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 26624 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 1 +- LoopUnroll: 16 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 2] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 32 ++ MacroTile1: 64 + MacroTileA: 32 +- MacroTileB: 32 ++ MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 8 +- NumLoadsB: 2 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 2 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 2 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -11653,13 +25878,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 0 ++ PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 52 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 114 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -11671,17 +25896,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -11697,16 +25922,16 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 64 +@@ -11727,7 +25952,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11736,24 +25961,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x128_MI32tZBbTqsVs9f9OqMJI3uwrG8EuJgZuhR6-JVUCzDyWV0= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x1Q-2DF-JoGCZLUhLY_pHhcECLiyc0pDCesQJ7YJVr3F8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11778,36 +26004,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 512 ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 135168 ++ LdsBytesNoAmax: 18432 + LdsInitCVgprs: false +- LdsNumBytes: 135168 +- LdsNumElementsAlignedA: 101376 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 18432 ++ LdsNumElementsAlignedA: 9216 ++ LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 101376 +- LdsOffsetB_Blk: 363520 ++ LdsOffsetA_Blk: 32768 ++ LdsOffsetB: 9216 ++ LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 135168 +- LdsOffsetMetadata_Blk: 363520 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 18432 ++ LdsOffsetMetadata_Blk: 41984 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -11815,11 +26041,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -11827,26 +26053,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [3, 1] +- MIWaveTileA: 3 ++ MIWaveTile: [1, 1] ++ MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 192 +- MacroTile1: 64 +- MacroTileA: 192 +- MacroTileB: 64 ++ MacroTile0: 32 ++ MacroTile1: 32 ++ MacroTileA: 32 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -11855,20 +26081,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 48 +- NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 24 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 4 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 2 ++ NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 24 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 2 ++ NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -11876,18 +26102,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 53 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x64x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 115 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -11895,16 +26121,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 48 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 48 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -11927,16 +26153,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -11951,7 +26177,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -11960,24 +26186,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x128_MI3XAa6bmpX-_CP_sWHQ2XYLWEPQoYUskBNL86kRnTwTLs= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16xP3kBNBW19scNjJ_uVu1AbrN11-GKPyA_euMht0WTUMQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -11990,7 +26217,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -12002,36 +26229,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 + LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 133120 ++ LdsBytesNoAmax: 81408 + LdsInitCVgprs: false +- LdsNumBytes: 133120 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 66560 ++ LdsNumBytes: 81408 ++ LdsNumElementsAlignedA: 67584 ++ LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 328704 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 133120 +- LdsOffsetMetadata_Blk: 328704 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 81408 ++ LdsOffsetMetadata_Blk: 198656 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12039,38 +26266,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [2, 2] +- MIWaveTileA: 2 +- MIWaveTileB: 2 ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 3] ++ MIWaveTileA: 4 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 128 +- MacroTileA: 128 +- MacroTileB: 128 ++ MacroTile0: 256 ++ MacroTile1: 48 ++ MacroTileA: 256 ++ MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12079,20 +26306,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 64 +- NumGlobalWriteVectorsPerThread: 32 ++ NumElementsPerThread: 48 ++ NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12106,30 +26333,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 54 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 116 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 32 +- ThreadTile1: 2 +- ThreadTileA: 32 +- ThreadTileB: 2 ++ ThreadTile0: 16 ++ ThreadTile1: 3 ++ ThreadTileA: 16 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12144,8 +26371,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 +- VectorWidthB: 2 ++ VectorWidthA: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -12153,14 +26380,14 @@ + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12175,7 +26402,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12184,24 +26411,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16ivlDldau-ihMer6VR6cMhqqjXcmWxL48sIoW9BnqzsQ= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16xpx4h0Bd_Jt_dZJCjh8c6TN2WW_5VWcY2O6apIGZEcPE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12226,34 +26454,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 100352 ++ LdsBytesNoAmax: 72192 + LdsInitCVgprs: false +- LdsNumBytes: 100352 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 33792 ++ LdsNumBytes: 72192 ++ LdsNumElementsAlignedA: 67584 ++ LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 197632 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 100352 +- LdsOffsetMetadata_Blk: 197632 ++ LdsOffsetMetadata: 72192 ++ LdsOffsetMetadata_Blk: 198656 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -12263,8 +26491,8 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -12274,15 +26502,15 @@ + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [4, 2] ++ MIWaveGroup: [4, 1] ++ MIWaveTile: [4, 1] + MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 128 +- MacroTile1: 64 +- MacroTileA: 128 +- MacroTileB: 64 ++ MacroTile0: 256 ++ MacroTile1: 16 ++ MacroTileA: 256 ++ MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -12294,7 +26522,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12303,20 +26531,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 +- NumLoadsB: 8 ++ NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12330,12 +26558,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 55 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 117 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT256x16x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -12343,17 +26571,17 @@ + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 8 +- SubGroup1: 32 +- SubGroupA: 8 +- SubGroupB: 32 ++ StreamKXCCMapping: 8 ++ SubGroup0: 16 ++ SubGroup1: 16 ++ SubGroupA: 16 ++ SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 16 +- ThreadTile1: 2 ++ ThreadTile1: 1 + ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12369,22 +26597,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 8, 1] ++ WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12399,7 +26627,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12408,24 +26636,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x128_MI32o1kA-nZimgd_sm33YPRDsSpixsj1sONTt-I-JcptFUo= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16xZVnjzI37fxEhqyuJ9Va3NfbXp8aILrjm90k7HpONVgE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12450,36 +26679,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 512 ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 135168 ++ LdsBytesNoAmax: 81408 + LdsInitCVgprs: false +- LdsNumBytes: 135168 +- LdsNumElementsAlignedA: 33792 +- LdsNumElementsAlignedB: 101376 ++ LdsNumBytes: 81408 ++ LdsNumElementsAlignedA: 13824 ++ LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 33792 +- LdsOffsetB_Blk: 295936 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 13824 ++ LdsOffsetB_Blk: 144896 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 135168 +- LdsOffsetMetadata_Blk: 295936 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 81408 ++ LdsOffsetMetadata_Blk: 144896 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -12487,38 +26716,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 2] +- MIWaveTile: [1, 3] +- MIWaveTileA: 1 +- MIWaveTileB: 3 ++ MIWaveGroup: [1, 4] ++ MIWaveTile: [3, 4] ++ MIWaveTileA: 3 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 192 +- MacroTileA: 64 +- MacroTileB: 192 ++ MacroTile0: 48 ++ MacroTile1: 256 ++ MacroTileA: 48 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12527,20 +26756,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 +- NumLoadsA: 8 +- NumLoadsB: 24 ++ NumLoadsA: 3 ++ NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 24 ++ NumLoadsPerpendicularA: 3 ++ NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -12548,18 +26777,18 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 56 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x192x128_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 118 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT48x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -12567,17 +26796,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 3 +- ThreadTileA: 16 +- ThreadTileB: 3 ++ ThreadTile0: 12 ++ ThreadTile1: 4 ++ ThreadTileA: 12 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12593,22 +26822,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12623,7 +26852,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12632,24 +26861,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x128_MI16bvQpXKKYOGGoG9OXAxwUC4ACQ9aLN-VCPSHG0U4RC80= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16xofcQCDJRsUaRczkPy0HigXr5BRtE3yrZ5VLVSMkvfvc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 128 +- DirectToLds: 0 ++ DepthU: 64 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12662,7 +26892,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -12674,34 +26904,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: false +- LSCA: 128 +- LSCB: 128 +- LSPA: 8 +- LSPB: 8 +- LVCA: 32 +- LVCB: 32 +- LVPA: 2 +- LVPB: 2 +- LdsBlockSizePerPadA: 2048 ++ LSCA: 64 ++ LSCB: 64 ++ LSPA: 16 ++ LSPB: 16 ++ LVCA: 16 ++ LVCB: 16 ++ LVPA: 4 ++ LVPB: 4 ++ LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 100864 ++ LdsBytesNoAmax: 72192 + LdsInitCVgprs: false +- LdsNumBytes: 100864 +- LdsNumElementsAlignedA: 33280 ++ LdsNumBytes: 72192 ++ LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 164352 ++ LdsOffsetB: 4608 ++ LdsOffsetB_Blk: 135680 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 100864 +- LdsOffsetMetadata_Blk: 164352 ++ LdsOffsetMetadata: 72192 ++ LdsOffsetMetadata_Blk: 135680 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -12711,8 +26941,8 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 4 +- LoopUnroll: 128 ++ LoopIters: 2 ++ LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] +@@ -12723,14 +26953,14 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] +- MIWaveTile: [4, 2] +- MIWaveTileA: 4 +- MIWaveTileB: 2 ++ MIWaveTile: [1, 4] ++ MIWaveTileA: 1 ++ MIWaveTileB: 4 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 128 +- MacroTileA: 64 +- MacroTileB: 128 ++ MacroTile0: 16 ++ MacroTile1: 256 ++ MacroTileA: 16 ++ MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 +@@ -12742,7 +26972,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -12751,19 +26981,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 32 +- NumGlobalWriteVectorsPerThread: 8 +- NumLoadsA: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 16 ++ NumLoadsA: 1 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -12778,30 +27008,30 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 57 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 119 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 512 ++ StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 2 +- ThreadTileA: 16 +- ThreadTileB: 2 ++ ThreadTile0: 4 ++ ThreadTile1: 4 ++ ThreadTileA: 4 ++ ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -12816,8 +27046,8 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 +- VectorWidthB: 2 ++ VectorWidthA: 1 ++ VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -12825,14 +27055,14 @@ + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 128 +- _DepthUA: 128 +- _DepthUB: 128 +- _DepthUMetadata: 128 ++ _DepthU: 64 ++ _DepthUA: 64 ++ _DepthUB: 64 ++ _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -12845,9 +27075,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -12856,24 +27086,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32xt92LodYfKN-WXbIu7wkVwdMNLK46YLjmEQ7eo5KiCwE= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16xxD28AfciTuBUhGnN2gCuZGzBc3xSVc5KrHncMZXMadU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -12898,7 +27129,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +@@ -12908,38 +27139,38 @@ + LVCB: 32 + LVPA: 2 + LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 133120 ++ LdsBytesNoAmax: 69632 + LdsInitCVgprs: false +- LdsNumBytes: 133120 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 69632 ++ LdsNumElementsAlignedA: 52224 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66560 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 99840 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 52224 ++ LdsOffsetB_Blk: 183296 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 99840 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 69632 ++ LdsOffsetMetadata_Blk: 183296 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 8 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -12947,48 +27178,48 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 ++ MIWaveTile: [3, 1] ++ MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 +- MacroTile0: 64 +- MacroTile1: 64 +- MacroTileA: 64 +- MacroTileB: 64 ++ MacroTile0: 96 ++ MacroTile1: 32 ++ MacroTileA: 96 ++ MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 ++ NumLoadsA: 12 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 12 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13002,29 +27233,29 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 58 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 120 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT96x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 12 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -13047,10 +27278,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13069,9 +27300,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13080,24 +27311,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x128_MI32xMQEp964Lg9F2MpI8jzwkit6RXXRvLY0K9U02KLHHohU= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16xT3DOSFx8Ccuyb49J1k-SWhs4SkyyTifP3jTrpIuWSx4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13110,7 +27342,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -13122,36 +27354,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +- LSPA: 4 +- LSPB: 4 ++ LSPA: 8 ++ LSPB: 8 + LVCA: 32 + LVCB: 32 +- LVPA: 1 +- LVPB: 1 +- LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 512 ++ LVPA: 2 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 116224 ++ LdsBytesNoAmax: 67584 + LdsInitCVgprs: false +- LdsNumBytes: 116224 +- LdsNumElementsAlignedA: 16896 ++ LdsNumBytes: 67584 ++ LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 82432 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 +- LdsOffsetMetadata_Blk: 82432 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 67584 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -13159,38 +27391,38 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 8 ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 2] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 2] ++ MIWaveTileA: 2 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 32 ++ MacroTile0: 64 + MacroTile1: 64 +- MacroTileA: 32 ++ MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -13199,21 +27431,21 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 ++ NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 +- NumLoadsB: 16 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 16 +- NumThreads: 128 ++ NumLoadsPerpendicularB: 8 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] +@@ -13226,8 +27458,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 59 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 121 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13236,20 +27468,20 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 +- SubGroup1: 64 +- SubGroupA: 2 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 8 ++ ThreadTile1: 2 ++ ThreadTileA: 8 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13264,17 +27496,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthA: 2 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13293,9 +27525,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13304,24 +27536,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI32xCUJ8XLbzB2Ojm1tiRT0zS1mZfWuWIpNX5h9BBExYHeU= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x-1VHSiMXiu4bKW6ZItXHRciEcXQ-QHHQ68TJidK3Jnk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13334,7 +27567,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 1 ++ GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -13346,57 +27579,57 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +- LSPA: 4 +- LSPB: 4 ++ LSPA: 8 ++ LSPB: 8 + LVCA: 32 + LVCB: 32 +- LVPA: 1 +- LVPB: 1 ++ LVPA: 2 ++ LVPB: 2 + LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115456 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 115456 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 98816 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 99328 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 8 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [2, 1] +- MIWaveTile: [1, 1] +- MIWaveTileA: 1 ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [2, 1] ++ MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 +@@ -13408,14 +27641,14 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -13423,35 +27656,35 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 8 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 4 ++ NumLoadsA: 8 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 8 +- NumThreads: 128 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 4 ++ NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 60 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 122 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13460,19 +27693,19 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 1 ++ StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 4 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 8 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -13488,17 +27721,17 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 1 ++ VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 2, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13517,9 +27750,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13528,24 +27761,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI32xzib3Bajt0ypE0FYFgYCiUlcsA6gtPbHu2bOdtmjDYdM= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16x-RYgdtF_GlQWov0yHjgCWjU5Y1kmkUE3Iw17NrK5k-k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13570,7 +27804,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +@@ -13580,87 +27814,87 @@ + LVCB: 32 + LVPA: 2 + LVPB: 2 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 98816 ++ LdsBytesNoAmax: 69632 + LdsInitCVgprs: false +- LdsNumBytes: 98816 +- LdsNumElementsAlignedA: 16640 +- LdsNumElementsAlignedB: 16640 ++ LdsNumBytes: 69632 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 52224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 65536 +- LdsOffsetB: 16640 +- LdsOffsetB_Blk: 82176 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16640 +- LdsOffsetMetadata_Blk: 82176 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 69632 ++ LdsOffsetMetadata_Blk: 148480 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true +- LoopIters: 2 +- LoopUnroll: 32 ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false ++ LoopIters: 4 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] +- MIWaveTile: [1, 1] ++ MIWaveGroup: [2, 2] ++ MIWaveTile: [1, 3] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 +- MacroTile1: 32 ++ MacroTile1: 96 + MacroTileA: 32 +- MacroTileB: 32 ++ MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 +- NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 4 ++ NumElementsPerThread: 12 ++ NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 +- NumLoadsB: 4 ++ NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 4 ++ NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13668,14 +27902,14 @@ + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] +- PrefetchGlobalRead: 1 ++ PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 61 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 123 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x96x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -13687,17 +27921,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 3 ++ ThreadTileA: 4 ++ ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13719,10 +27953,10 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -13743,7 +27977,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13752,24 +27986,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x256_MI32xETshwJJGUOBs3HipPWnS4a64hmTABTw5Crxpm8wRL4o= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16x2vAfVfhumbo7JIL5YPAkZaISpYHS4fmRlE-Zl2b9ENY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 256 +- DirectToLds: 0 ++ DepthU: 128 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -13794,36 +28029,36 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 +- LSCB: 256 +- LSPA: 4 +- LSPB: 4 +- LVCA: 64 +- LVCB: 64 +- LVPA: 1 +- LVPB: 1 +- LdsBlockSizePerPadA: 1024 ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 133120 ++ LdsBytesNoAmax: 51200 + LdsInitCVgprs: false +- LdsNumBytes: 133120 +- LdsNumElementsAlignedA: 66560 +- LdsNumElementsAlignedB: 66560 ++ LdsNumBytes: 51200 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 66560 +- LdsOffsetB_Blk: 328704 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 133120 +- LdsOffsetMetadata_Blk: 328704 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 51200 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 +@@ -13831,11 +28066,11 @@ + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false +- LoopIters: 16 +- LoopUnroll: 256 ++ LoopIters: 4 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 +@@ -13843,26 +28078,26 @@ + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] +- MIWaveTile: [1, 1] ++ MIWaveTile: [1, 2] + MIWaveTileA: 1 +- MIWaveTileB: 1 ++ MIWaveTileB: 2 + MIWaveTileMetadata: 0 +- MacroTile0: 64 ++ MacroTile0: 32 + MacroTile1: 64 +- MacroTileA: 64 ++ MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -13871,20 +28106,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 +- NumElementsPerThread: 16 +- NumGlobalWriteVectorsPerThread: 16 +- NumLoadsA: 16 +- NumLoadsB: 16 ++ NumElementsPerBatchStore: 0 ++ NumElementsPerThread: 8 ++ NumGlobalWriteVectorsPerThread: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -13898,12 +28133,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 62 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x256_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 124 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 1024 ++ StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -13911,17 +28146,17 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 4 +- SubGroup1: 64 +- SubGroupA: 4 +- SubGroupB: 64 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 ++ SubGroup1: 32 ++ SubGroupA: 8 ++ SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 +- ThreadTile1: 1 +- ThreadTileA: 16 +- ThreadTileB: 1 ++ ThreadTile0: 4 ++ ThreadTile1: 2 ++ ThreadTileA: 4 ++ ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 +@@ -13937,22 +28172,22 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 1 ++ VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [64, 4, 1] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 256 +- _DepthUA: 256 +- _DepthUB: 256 +- _DepthUMetadata: 256 ++ _DepthU: 128 ++ _DepthUA: 128 ++ _DepthUB: 128 ++ _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -13965,9 +28200,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -13976,24 +28211,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x256_MI32xF0IDKpT9qNEZ8taHX__2g8QNElTrGFOmXHaku6DNACo= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x_DhK6bvoU31GRoa_IALnmROhTvTr9DjcG9Aebjue5L4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false +- CustomKernelName: '' +- DebugStreamK: 0 +- DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ CustomKernelName: '' ++ DebugStreamK: 0 ++ DepthU: 128 ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14018,55 +28254,55 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 256 +- LSCB: 256 +- LSPA: 4 +- LSPB: 4 +- LVCA: 64 +- LVCB: 64 +- LVPA: 1 +- LVPB: 1 +- LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 1024 ++ LSCA: 128 ++ LSCB: 128 ++ LSPA: 8 ++ LSPB: 8 ++ LVCA: 32 ++ LVCB: 32 ++ LVPA: 2 ++ LVPB: 2 ++ LdsBlockSizePerPadA: 512 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 133120 ++ LdsBytesNoAmax: 34816 + LdsInitCVgprs: false +- LdsNumBytes: 133120 +- LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 33280 ++ LdsNumBytes: 34816 ++ LdsNumElementsAlignedA: 17408 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66560 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 99840 ++ LdsOffsetA_Blk: 65536 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 99840 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 34816 ++ LdsOffsetMetadata_Blk: 82944 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 4 +- LoopUnroll: 64 ++ LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] ++ MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 +@@ -14080,14 +28316,14 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false +@@ -14095,20 +28331,20 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 8 +- NumLoadsB: 8 ++ NumLoadsA: 4 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 +- NumLoadsPerpendicularB: 8 ++ NumLoadsPerpendicularA: 4 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -14122,29 +28358,29 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 63 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x256_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 125 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 1024 ++ StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -14167,16 +28403,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 256 +- _DepthUA: 256 +- _DepthUB: 256 +- _DepthUMetadata: 256 ++ _DepthU: 128 ++ _DepthUA: 128 ++ _DepthUB: 128 ++ _DepthUMetadata: 128 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -14191,7 +28427,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14200,24 +28436,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x512_MI32xeQSEp_t5uFDmjQ-5DkLsfT4NXSFjQSOvoxcHELlHcM8= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x9XTkM0Tk_ys5M0gfN4Kfqk9aGJpXt8GsCI3gKoFJAYc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 +- DepthU: 512 +- DirectToLds: 0 ++ DepthU: 256 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14242,55 +28479,55 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x512_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: false +- LSCA: 512 +- LSCB: 512 +- LSPA: 2 +- LSPB: 2 +- LVCA: 128 +- LVCB: 128 ++ LSCA: 256 ++ LSCB: 256 ++ LSPA: 4 ++ LSPB: 4 ++ LVCA: 64 ++ LVCB: 64 + LVPA: 1 + LVPB: 1 +- LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 132096 ++ LdsBytesNoAmax: 67584 + LdsInitCVgprs: false +- LdsNumBytes: 132096 +- LdsNumElementsAlignedA: 66048 +- LdsNumElementsAlignedB: 66048 ++ LdsNumBytes: 67584 ++ LdsNumElementsAlignedA: 33792 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 262144 +- LdsOffsetB: 66048 +- LdsOffsetB_Blk: 328192 ++ LdsOffsetA_Blk: 131072 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 132096 +- LdsOffsetMetadata_Blk: 328192 +- LdsPadA: 4 +- LdsPadB: 4 ++ LdsOffsetMetadata: 67584 ++ LdsOffsetMetadata_Blk: 164864 ++ LdsPadA: 8 ++ LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 +- LocalSplitU: 4 ++ LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 +- LoopUnroll: 128 ++ LoopUnroll: 256 + MFMA_BF16_1K: false + MIArchVgpr: false +- MIBlock: [32, 32, 16, 1, 1, 1] ++ MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 +- MIWaveGroup: [1, 1] ++ MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 +@@ -14304,35 +28541,35 @@ + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 +- MatrixInstK: 16 +- MatrixInstM: 32 +- MatrixInstN: 32 +- MatrixInstruction: [32, 32, 16, 1] ++ MatrixInstK: 32 ++ MatrixInstM: 16 ++ MatrixInstN: 16 ++ MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsA: 8 ++ NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -14346,12 +28583,12 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 64 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x512_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 126 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +- StaggerUStride: 2048 ++ StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false +@@ -14359,16 +28596,16 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 +- SubGroup0: 2 ++ StreamKXCCMapping: 8 ++ SubGroup0: 8 + SubGroup1: 32 +- SubGroupA: 2 ++ SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] +- ThreadTile0: 16 ++ ThreadTile0: 4 + ThreadTile1: 1 +- ThreadTileA: 16 ++ ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true +@@ -14391,16 +28628,16 @@ + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 +- WorkGroup: [32, 2, 4] ++ WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] +- _DepthU: 512 +- _DepthUA: 512 +- _DepthUB: 512 +- _DepthUMetadata: 512 ++ _DepthU: 256 ++ _DepthUA: 256 ++ _DepthUB: 256 ++ _DepthUMetadata: 256 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 +@@ -14415,7 +28652,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14424,24 +28661,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x128_MI16xX2cMVMz12xNrABuv0bAZtTAngS8ioQAafBBQDKR_o-M= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16x6b47IKrTodXGBdvcsZ5bkuE9LUtxeAl66Q_mlHQHTdQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14466,7 +28704,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +@@ -14534,7 +28772,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -14542,13 +28780,13 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 +@@ -14565,13 +28803,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 65 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 127 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -14583,7 +28821,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -14617,8 +28855,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -14639,7 +28877,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14648,24 +28886,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16xBky-aeRuiG8iVkZ9cnpS76hSbXK-IXKaS39NUSv7xE8= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16xrMGZtpX5PkQytOJG3dODpWIrVUCmBNHEHXGGx5bK1a8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14678,7 +28917,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -14690,7 +28929,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +@@ -14700,24 +28939,24 @@ + LVCB: 32 + LVPA: 2 + LVPB: 2 +- LdsBlockSizePerPadA: 1024 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 25600 ++ LdsBytesNoAmax: 26112 + LdsInitCVgprs: false +- LdsNumBytes: 25600 +- LdsNumElementsAlignedA: 16896 ++ LdsNumBytes: 26112 ++ LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +- LdsOffsetB: 16896 +- LdsOffsetB_Blk: 49664 ++ LdsOffsetB: 17408 ++ LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 25600 +- LdsOffsetMetadata_Blk: 49664 ++ LdsOffsetMetadata: 26112 ++ LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -14758,23 +28997,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 +@@ -14789,13 +29028,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 66 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 128 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -14804,10 +29043,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -14832,7 +29071,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -14841,8 +29080,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -14863,7 +29102,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -14872,24 +29111,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x16x128_MI16xogklnvnUKE9DazG3mgbaU_-bL8229dzBSOqmLR8cvyk= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16xymCyUU-NTZ_-63qUJUtaC8sXIkpJJGOOdhwEVy5wRk4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -14902,7 +29142,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -14914,7 +29154,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +@@ -14924,24 +29164,24 @@ + LVCB: 32 + LVPA: 2 + LVPB: 2 +- LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 41984 ++ LdsBytesNoAmax: 43520 + LdsInitCVgprs: false +- LdsNumBytes: 41984 +- LdsNumElementsAlignedA: 33280 ++ LdsNumBytes: 43520 ++ LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 34816 ++ LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 41984 +- LdsOffsetMetadata_Blk: 98816 ++ LdsOffsetMetadata: 43520 ++ LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -14982,23 +29222,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 +@@ -15013,13 +29253,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 67 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 129 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15028,10 +29268,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15056,7 +29296,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -15065,8 +29305,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15085,9 +29325,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15096,31 +29336,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x67UshOwJR_kocT5DU8jXcpCWNZlABQU3EyDe5rC2z-Q= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16xQ3WD5KCWKQhgKlrX80fWAB05AtCQXjshpV4UF_6RPRk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -15138,24 +29379,24 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +- LSPA: 2 ++ LSPA: 8 + LSPB: 8 +- LVCA: 128 ++ LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 1024 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 58368 ++ LdsBytesNoAmax: 26112 + LdsInitCVgprs: false +- LdsNumBytes: 58368 ++ LdsNumBytes: 26112 + LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 16896 ++ LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 +@@ -15164,7 +29405,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 ++ LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 +@@ -15206,7 +29447,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -15214,20 +29455,20 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 +- NumLoadsA: 8 ++ NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -15237,13 +29478,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 68 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 130 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15255,7 +29496,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15281,7 +29522,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -15289,8 +29530,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15309,9 +29550,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15320,31 +29561,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x64x128_MI16x5ihU4PJTyHTqqvFs0qJ2tfkVxAqnMAYRssHvBE2wVk0= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16xbSFrBgYP13gsk2bvMpI41SbsIY1jf5NJxFHe5hxzrxc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -15362,24 +29604,24 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 128 + LSCB: 128 +- LSPA: 2 ++ LSPA: 8 + LSPB: 8 +- LVCA: 128 ++ LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 +- LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 107520 ++ LdsBytesNoAmax: 43520 + LdsInitCVgprs: false +- LdsNumBytes: 107520 ++ LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 +- LdsNumElementsAlignedB: 33280 ++ LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +@@ -15388,7 +29630,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 8704 ++ LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 +@@ -15397,8 +29639,8 @@ + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false +@@ -15430,28 +29672,28 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalA: 0 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 +- NumLoadsA: 8 ++ NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 8 ++ NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -15461,13 +29703,13 @@ + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 +- PrefetchLocalRead: 1 ++ PrefetchLocalRead: 0 + PreloadKernArgs: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 69 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA4_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 131 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15479,7 +29721,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15505,7 +29747,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -15513,8 +29755,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 128 +@@ -15533,9 +29775,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15544,24 +29786,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16xWtydklDnBQ_B7YYXr3KVgmGxPQcOretEqo-aKYx7rfs= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16x8Tuw9IcBCLRGXtpUcdJq9z9BcborzCrg79clJid8N54= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -15569,7 +29812,7 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false +@@ -15586,22 +29829,22 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 +- LSPB: 1 ++ LSPB: 4 + LVCA: 64 +- LVCB: 256 ++ LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 33792 + LdsInitCVgprs: false +- LdsNumBytes: 99328 ++ LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 +@@ -15612,7 +29855,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 ++ LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 +@@ -15621,8 +29864,8 @@ + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -15654,17 +29897,17 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -15672,11 +29915,11 @@ + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 +- NumLoadsB: 16 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -15690,8 +29933,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 70 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 132 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15703,7 +29946,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15737,8 +29980,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -15757,9 +30000,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15768,37 +30011,38 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16xOthMhdzofq5Ai4qnr6ocO8PBCIYRK46xitHqMZd47F8= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16xmuKOIgE0wslcNc_sqx4oyw4vGRvoLyoG5WRPnRPb6Q0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -15810,34 +30054,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 +- LSPA: 1 ++ LSPA: 4 + LSPB: 4 +- LVCA: 256 ++ LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 +- LdsBlockSizePerPadA: 2048 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115712 ++ LdsBytesNoAmax: 50688 + LdsInitCVgprs: false +- LdsNumBytes: 115712 +- LdsNumElementsAlignedA: 33280 ++ LdsNumBytes: 50688 ++ LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +- LdsOffsetB: 33280 +- LdsOffsetB_Blk: 98816 ++ LdsOffsetB: 33792 ++ LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 98816 ++ LdsOffsetMetadata: 50688 ++ LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -15845,8 +30089,8 @@ + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -15878,28 +30122,28 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 +- NumLoadsA: 32 ++ NumGlobalWriteVectorsPerThread: 2 ++ NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 32 ++ NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -15914,8 +30158,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 71 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 133 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -15924,10 +30168,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -15952,7 +30196,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -15961,8 +30205,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -15983,7 +30227,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -15992,24 +30236,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x16x256_MI16xrafy55ozbpreSdbUPz6492hTeNSRyM--gWHFRO1AJfM= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16xdqCpbTaaU_QXu3LDCZuht-YtBNKCAOvMTsLpNKo2S5M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16017,12 +30262,12 @@ + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 +- GlobalReadVectorWidthB: 1 ++ GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 4 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -16034,34 +30279,34 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 +- LSPB: 1 ++ LSPB: 4 + LVCA: 64 +- LVCB: 256 ++ LVCB: 64 + LVPA: 1 + LVPB: 1 +- LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82944 ++ LdsBytesNoAmax: 84480 + LdsInitCVgprs: false +- LdsNumBytes: 82944 +- LdsNumElementsAlignedA: 66048 ++ LdsNumBytes: 84480 ++ LdsNumElementsAlignedA: 67584 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66048 +- LdsOffsetB_Blk: 197120 ++ LdsOffsetB: 67584 ++ LdsOffsetB_Blk: 198656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82944 +- LdsOffsetMetadata_Blk: 197120 ++ LdsOffsetMetadata: 84480 ++ LdsOffsetMetadata_Blk: 198656 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -16102,29 +30347,29 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 +- NumLoadsB: 16 ++ NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 +- NumLoadsPerpendicularB: 16 ++ NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 +@@ -16138,8 +30383,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 72 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW4_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 134 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT64x16x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16148,10 +30393,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 4 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16176,7 +30421,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 4 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -16185,8 +30430,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16205,9 +30450,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16216,24 +30461,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x256_MI16xyCCcfs_lsxTb6OvFiYzUJGj2MrYAlrIVhvwebvZY8io= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16x18QTxhE42xHUR_atRyKrXdrtlDhjSg-cwxnbdan2vO4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16258,7 +30504,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 +@@ -16269,13 +30515,13 @@ + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 2048 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 115712 ++ LdsBytesNoAmax: 50688 + LdsInitCVgprs: false +- LdsNumBytes: 115712 ++ LdsNumBytes: 50688 + LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 33280 ++ LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 +@@ -16284,7 +30530,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 16896 ++ LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 +@@ -16293,8 +30539,8 @@ + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false +@@ -16326,17 +30572,17 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -16362,8 +30608,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 73 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 135 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16375,7 +30621,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16401,7 +30647,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -16409,8 +30655,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16431,7 +30677,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16440,24 +30686,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x64x256_MI16xKFlAanmhduVXq5b2l3uwvVHmzSqorK_l4mfmYtXi2pE= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16xl1ryk09kO_iD_gvC6LPHnHMlsSzDNteOQYQyRf0AZbc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16482,7 +30729,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 256 + LSCB: 256 +@@ -16493,13 +30740,13 @@ + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 +- LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 82944 ++ LdsBytesNoAmax: 84480 + LdsInitCVgprs: false +- LdsNumBytes: 82944 ++ LdsNumBytes: 84480 + LdsNumElementsAlignedA: 16896 +- LdsNumElementsAlignedB: 66048 ++ LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +@@ -16508,7 +30755,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 82944 ++ LdsOffsetMetadata: 84480 + LdsOffsetMetadata_Blk: 147968 + LdsPadA: 8 + LdsPadB: 8 +@@ -16550,7 +30797,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -16558,9 +30805,9 @@ + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +@@ -16586,8 +30833,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 74 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 136 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x64x256_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -16599,7 +30846,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16625,7 +30872,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 4 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -16633,8 +30880,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 256 +@@ -16653,9 +30900,9 @@ + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- - 1LDSBuffer: 0 ++ - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16664,24 +30911,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x512_MI16xFOB38I69YtIGEGQf2vy0fCC35qQF0f58X2aUtFuXgjo= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16xf2lzpqvshEyUl8i82obA9DKHWPjvrIoew9EKGdEECsw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: true +- DirectToLdsA: true +- DirectToLdsB: true ++ DirectToLds: false ++ DirectToLdsA: false ++ DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16706,7 +30954,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 +@@ -16719,21 +30967,21 @@ + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 133120 ++ LdsBytesNoAmax: 66560 + LdsInitCVgprs: false +- LdsNumBytes: 133120 ++ LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 +- LdsOffsetA_Blk: 66560 ++ LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 +- LdsOffsetB_Blk: 99840 ++ LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 33280 +- LdsOffsetMetadata_Blk: 99840 ++ LdsOffsetMetadata: 66560 ++ LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -16741,8 +30989,8 @@ + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 +- LocalWriteUseSgprA: true +- LocalWriteUseSgprB: true ++ LocalWriteUseSgprA: false ++ LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false +@@ -16774,21 +31022,21 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 +- NoLdsWriteCode: true ++ MbskPrefetchOpt: 0 ++ NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 +- NonTemporalB: 4 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalB: 0 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 +@@ -16810,20 +31058,20 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 75 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 137 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 2048 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 +- StoreSwapAddr: true ++ StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -16857,8 +31105,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +@@ -16879,7 +31127,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -16888,24 +31136,25 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x512_MI16xBrccpYC-A0euFRmbNfu5HGyEaRVDT-8QxfSFBF0y9wc= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x-gs85QPVkOb8g9iN1wA-T20QdKUhgPmUqMAJT1OAfo4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: false ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 +@@ -16918,7 +31167,7 @@ + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false +- GlobalWriteVectorWidth: 2 ++ GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true +@@ -16930,7 +31179,7 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false + LSCA: 512 + LSCB: 512 +@@ -16940,24 +31189,24 @@ + LVCB: 128 + LVPA: 1 + LVPB: 1 +- LdsBlockSizePerPadA: 4096 ++ LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 99840 + LdsInitCVgprs: false +- LdsNumBytes: 99328 +- LdsNumElementsAlignedA: 66048 ++ LdsNumBytes: 99840 ++ LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +- LdsOffsetB: 66048 +- LdsOffsetB_Blk: 197120 ++ LdsOffsetB: 66560 ++ LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 99328 +- LdsOffsetMetadata_Blk: 197120 ++ LdsOffsetMetadata: 99840 ++ LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 +@@ -16998,23 +31247,23 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 +- NonTemporalA: 4 ++ NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 +- NumElementsPerBatchStore: 16 ++ NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 +- NumGlobalWriteVectorsPerThread: 1 ++ NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 +@@ -17034,8 +31283,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 76 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA4096_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW2_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 138 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT32x16x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -17044,10 +31293,10 @@ + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 +- StoreVectorWidth: 2 ++ StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -17072,7 +31321,7 @@ + UseSgprForGRO: 0 + Valid: true + VectorStore: -1 +- VectorWidthA: 2 ++ VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 +@@ -17081,8 +31330,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +@@ -17103,7 +31352,7 @@ + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false +- ActivationFuncCall: false ++ ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 +@@ -17112,31 +31361,32 @@ + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true +- BaseName: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x512_MI16xaeONEdi_9_IST0QcKX-8Nrl3RHfnEA3EwO5r3Ac7CuE= ++ BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16xP5a5-R6MLCObPaIshiyXZqEb7ARzw057lHBIqwkwGkg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 +- ClusterLocalRead: true ++ ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 512 +- DirectToLds: 0 ++ DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr ++ EnableF32XEmulationLds: false + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 +- GlobalReadVectorWidthA: 1 ++ GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer +@@ -17154,24 +31404,24 @@ + SupportUserGSU: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly +- KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB4096_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 ++ KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: false +- LSCA: 256 ++ LSCA: 512 + LSCB: 512 +- LSPA: 1 ++ LSPA: 2 + LSPB: 2 +- LVCA: 256 ++ LVCA: 128 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 +- LdsBlockSizePerPadB: 4096 ++ LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 +- LdsBytesNoAmax: 99328 ++ LdsBytesNoAmax: 99840 + LdsInitCVgprs: false +- LdsNumBytes: 99328 ++ LdsNumBytes: 99840 + LdsNumElementsAlignedA: 33280 +- LdsNumElementsAlignedB: 66048 ++ LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 +@@ -17180,7 +31430,7 @@ + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 +- LdsOffsetMetadata: 99328 ++ LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 +@@ -17222,7 +31472,7 @@ + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 +- MbskPrefetchMethod: 0 ++ MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false +@@ -17231,19 +31481,19 @@ + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 +- NonTemporalC: 4 +- NonTemporalD: 4 ++ NonTemporalC: 0 ++ NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 +- NumLoadsA: 32 ++ NumLoadsA: 8 + NumLoadsB: 16 +- NumLoadsCoalescedA: 2 ++ NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 +- NumLoadsPerpendicularA: 16 ++ NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 +@@ -17258,8 +31508,8 @@ + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 +- SolutionIndex: 77 +- SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTL0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB4096_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA2_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCG304 ++ SolutionIndex: 139 ++ SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HAS_SAV_UserArgs_MT16x32x512_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS2048_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC1_WGMXCCGn1 + SourceSwap: true + StaggerU: 0 + StaggerUMapping: 0 +@@ -17271,7 +31521,7 @@ + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 +- StreamKXCCMapping: 0 ++ StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 +@@ -17297,7 +31547,7 @@ + Valid: true + VectorStore: -1 + VectorWidthA: 1 +- VectorWidthB: 2 ++ VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 +@@ -17305,8 +31555,8 @@ + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 +- WorkGroupMappingXCC: 8 +- WorkGroupMappingXCCGroup: 304 ++ WorkGroupMappingXCC: 1 ++ WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 512 +-- +2.43.0 + diff --git a/rocm-libraries b/rocm-libraries index 4887d215159..4336ac10888 160000 --- a/rocm-libraries +++ b/rocm-libraries @@ -1 +1 @@ -Subproject commit 4887d215159d2a36a60e8e20d81c5d7a7bf7eeb7 +Subproject commit 4336ac10888de969c34d214c2ee353a6062d89fb